From d0fc088b6b9a4226ba130e97bcbb5cb326cb738a Mon Sep 17 00:00:00 2001
From: Michael <57787676+baentsch@users.noreply.github.com>
Date: Mon, 22 Feb 2021 11:47:03 +0100
Subject: [PATCH 1/6] fixing copy_from_upstream issues

---
 .../copy_from_upstream/copy_from_upstream.py  | 21 +++++++++++--------
 .../copy_from_upstream/copy_from_upstream.yml | 20 ------------------
 2 files changed, 12 insertions(+), 29 deletions(-)

diff --git a/scripts/copy_from_upstream/copy_from_upstream.py b/scripts/copy_from_upstream/copy_from_upstream.py
index 77cfd77a05..28274dde0a 100755
--- a/scripts/copy_from_upstream/copy_from_upstream.py
+++ b/scripts/copy_from_upstream/copy_from_upstream.py
@@ -78,7 +78,7 @@ def replacer(filename, instructions, delimiter):
     file_put_contents(os.path.join(os.environ['LIBOQS_DIR'], filename), contents)
 
 def load_instructions():
-    subprocess_stdout = subprocess.STDOUT if DEBUG > 0 else subprocess.DEVNULL
+    subprocess_stdout = None if DEBUG > 0 else subprocess.DEVNULL
     instructions = file_get_contents(
         os.path.join(os.environ['LIBOQS_DIR'], 'scripts', 'copy_from_upstream', 'copy_from_upstream.yml'),
         encoding='utf-8')
@@ -203,13 +203,7 @@ def load_instructions():
             scheme['pqclean_scheme_c'] = scheme['pqclean_scheme'].replace('-', '')
             scheme['scheme_c'] = scheme['scheme'].replace('-', '')
             scheme['default_implementation'] = family['default_implementation']
-            # This is a temporary hack to work around the fact that
-            # the PQClean's META.ymls for the Dilithium AVX2 variants
-            # are not properly specified.
-            if scheme['pretty_name_full'].startswith('DILITHIUM_') and scheme['upstream_location'] == "pqclean":
-                scheme['metadata']['implementations'][1]['supported_platforms'][0]['operating_systems'] = ['Linux']
-                scheme['metadata']['implementations'][1]['supported_platforms'][0]['required_flags'] = ['avx2', 'bmi1',
-                                                                                                        'popcnt']
+
             for impl in scheme['metadata']['implementations']:
                 if 'common_dep' in impl:
                     cdeps_names = impl['common_dep'].split(" ")
@@ -376,7 +370,7 @@ def process_families(instructions, basedir, with_kat, with_generator):
                srcs = handle_implementation(impl, family, scheme, basedir)
                if DEBUG > 3:
                    print("SRCs found: %s" % (srcs))
-               if (scheme['sources']):
+               if ('sources' in scheme):
                    assert (len(scheme['sources']) == len(srcs))
                # in any case: add 'sources' to implementation(s)
                # Only retain this 1 implementation:
@@ -385,6 +379,14 @@ def process_families(instructions, basedir, with_kat, with_generator):
                scheme['metadata']['implementations'][0]['sources'] = srcs
            else:
                # If no scheme['implementation'] given, get the list from META.yml and add all implementations
+               # our code generator logic assumes only one default and one optimized implementation
+               # so, for mceliece, kill off "clean" and "sse" implementations until this is fixed TBD
+               if family['name'] == "classic_mceliece":
+                   mceimps = []
+                   for i in scheme['metadata']['implementations']:
+                       if i['name'] != "sse" and i['name'] != "clean":
+                           mceimps.append(i)
+                   scheme['metadata']['implementations'] = mceimps
                for impl in scheme['metadata']['implementations']:
                    srcs = handle_implementation(impl['name'], family, scheme, basedir)
                    if DEBUG > 2:
@@ -402,6 +404,7 @@ def process_families(instructions, basedir, with_kat, with_generator):
                              scheme['scheme'], str(ke), impl['name']))
                    pass
 
+
    if with_kat:
        if family in instructions['kems']:
            try:
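Two of the hunks above are worth a brief gloss. The first switches subprocess_stdout from subprocess.STDOUT to None, so debug runs simply inherit the parent's stdout (subprocess.STDOUT is only meaningful as a value for the stderr argument). The later hunk drops the "clean" and "sse" Classic McEliece implementations so the generator sees only one default and one optimized implementation per scheme. The sketch below is not part of the patch: the helper name and the hand-written metadata entries are hypothetical stand-ins for what copy_from_upstream.py actually reads from PQClean META.yml files, and it only mirrors the filtering behaviour added to process_families().

    # Standalone sketch (not from the patch) of the classic_mceliece filtering.
    # The metadata entries here are hypothetical; the real ones come from META.yml.
    def filter_mceliece_implementations(family_name, implementations):
        # The generator currently handles one default ("vec") and one optimized
        # ("avx") implementation per scheme, so "clean" and "sse" are dropped.
        if family_name != "classic_mceliece":
            return implementations
        return [impl for impl in implementations
                if impl['name'] not in ("clean", "sse")]

    if __name__ == "__main__":
        impls = [{'name': 'clean'}, {'name': 'sse'}, {'name': 'vec'}, {'name': 'avx'}]
        print(filter_mceliece_implementations("classic_mceliece", impls))
        # prints [{'name': 'vec'}, {'name': 'avx'}]

The same effect could have been achieved by listing the surviving implementations per scheme in copy_from_upstream.yml, but the patch instead removes those per-scheme overrides (next diff) and filters in the script, keeping the YAML closer to what upstream's META.yml advertises.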
diff --git a/scripts/copy_from_upstream/copy_from_upstream.yml b/scripts/copy_from_upstream/copy_from_upstream.yml
index 1e14da070d..fccc83303c 100644
--- a/scripts/copy_from_upstream/copy_from_upstream.yml
+++ b/scripts/copy_from_upstream/copy_from_upstream.yml
@@ -34,62 +34,42 @@ kems:
         scheme: "348864"
         pqclean_scheme: mceliece348864
         pretty_name_full: Classic-McEliece-348864
-        implementation: vec
-        sources: ['aes256ctr.c', 'benes.c', 'bm.c', 'controlbits.c', 'decrypt.c', 'encrypt.c', 'fft.c', 'fft_tr.c', 'gf.c', 'operations.c', 'pk_gen.c', 'sk_gen.c', 'transpose.c', 'util.c', 'vec.c']
       -
         scheme: "348864f"
         pqclean_scheme: mceliece348864f
         pretty_name_full: Classic-McEliece-348864f
-        implementation: vec
-        sources: ['aes256ctr.c', 'benes.c', 'bm.c', 'controlbits.c', 'decrypt.c', 'encrypt.c', 'fft.c', 'fft_tr.c', 'gf.c', 'operations.c', 'pk_gen.c', 'sk_gen.c', 'transpose.c', 'util.c', 'vec.c']
       -
         scheme: "460896"
         pqclean_scheme: mceliece460896
         pretty_name_full: Classic-McEliece-460896
-        implementation: vec
-        sources: ['aes256ctr.c', 'benes.c', 'bm.c', 'controlbits.c', 'decrypt.c', 'encrypt.c', 'fft.c', 'fft_tr.c', 'gf.c', 'operations.c', 'pk_gen.c', 'sk_gen.c', 'transpose.c', 'util.c', 'vec.c']
       -
         scheme: "460896f"
         pqclean_scheme: mceliece460896f
         pretty_name_full: Classic-McEliece-460896f
-        implementation: vec
-        sources: ['aes256ctr.c', 'benes.c', 'bm.c', 'controlbits.c', 'decrypt.c', 'encrypt.c', 'fft.c', 'fft_tr.c', 'gf.c', 'operations.c', 'pk_gen.c', 'sk_gen.c', 'transpose.c', 'util.c', 'vec.c']
       -
         scheme: "6688128"
         pqclean_scheme: mceliece6688128
         pretty_name_full: Classic-McEliece-6688128
-        implementation: vec
-        sources: ['aes256ctr.c', 'benes.c', 'bm.c', 'controlbits.c', 'decrypt.c', 'encrypt.c', 'fft.c', 'fft_tr.c', 'gf.c', 'operations.c', 'pk_gen.c', 'sk_gen.c', 'transpose.c', 'util.c', 'vec.c']
       -
         scheme: "6688128f"
         pqclean_scheme: mceliece6688128f
         pretty_name_full: Classic-McEliece-6688128f
-        implementation: vec
-        sources: ['aes256ctr.c', 'benes.c', 'bm.c', 'controlbits.c', 'decrypt.c', 'encrypt.c', 'fft.c', 'fft_tr.c', 'gf.c', 'operations.c', 'pk_gen.c', 'sk_gen.c', 'transpose.c', 'util.c', 'vec.c']
       -
         scheme: "6960119"
         pqclean_scheme: mceliece6960119
         pretty_name_full: Classic-McEliece-6960119
-        implementation: vec
-        sources: ['aes256ctr.c', 'benes.c', 'bm.c', 'controlbits.c', 'decrypt.c', 'encrypt.c', 'fft.c', 'fft_tr.c', 'gf.c', 'operations.c', 'pk_gen.c', 'sk_gen.c', 'transpose.c', 'util.c', 'vec.c']
       -
         scheme: "6960119f"
         pqclean_scheme: mceliece6960119f
         pretty_name_full: Classic-McEliece-6960119f
-        implementation: vec
-        sources: ['aes256ctr.c', 'benes.c', 'bm.c', 'controlbits.c', 'decrypt.c', 'encrypt.c', 'fft.c', 'fft_tr.c', 'gf.c', 'operations.c', 'pk_gen.c', 'sk_gen.c', 'transpose.c', 'util.c', 'vec.c']
       -
         scheme: "8192128"
         pqclean_scheme: mceliece8192128
         pretty_name_full: Classic-McEliece-8192128
-        implementation: vec
-        sources: ['aes256ctr.c', 'benes.c', 'bm.c', 'controlbits.c', 'decrypt.c', 'encrypt.c', 'fft.c', 'fft_tr.c', 'gf.c', 'operations.c', 'pk_gen.c', 'sk_gen.c', 'transpose.c', 'util.c', 'vec.c']
       -
         scheme: "8192128f"
         pqclean_scheme: mceliece8192128f
         pretty_name_full: Classic-McEliece-8192128f
-        implementation: vec
-        sources: ['aes256ctr.c', 'benes.c', 'bm.c', 'controlbits.c', 'decrypt.c', 'encrypt.c', 'fft.c', 'fft_tr.c', 'gf.c', 'operations.c', 'pk_gen.c', 'sk_gen.c', 'transpose.c', 'util.c', 'vec.c']
   -
     name: hqc
     default_implementation: clean

From ad907f9a946e82fa845c5054dd2843b60b2f104e Mon Sep 17 00:00:00 2001
From: Michael <57787676+baentsch@users.noreply.github.com>
Date: Mon, 22 Feb 2021 12:13:29 +0100
Subject: [PATCH 2/6] adding mceliece-avx & falcon

---
 .CMake/alg_support.cmake                      | 30 +
 .../copy_from_upstream/copy_from_upstream.yml | 2 +-
src/kem/classic_mceliece/CMakeLists.txt | 80 + .../kem_classic_mceliece_348864.c | 45 + .../kem_classic_mceliece_348864f.c | 45 + .../kem_classic_mceliece_460896.c | 45 + .../kem_classic_mceliece_460896f.c | 45 + .../kem_classic_mceliece_6688128.c | 45 + .../kem_classic_mceliece_6688128f.c | 45 + .../kem_classic_mceliece_6960119.c | 45 + .../kem_classic_mceliece_6960119f.c | 45 + .../kem_classic_mceliece_8192128.c | 45 + .../kem_classic_mceliece_8192128f.c | 45 + .../pqclean_mceliece348864_avx/LICENSE | 16 + .../pqclean_mceliece348864_avx/aes256ctr.c | 13 + .../pqclean_mceliece348864_avx/aes256ctr.h | 17 + .../pqclean_mceliece348864_avx/api.h | 32 + .../pqclean_mceliece348864_avx/benes.c | 287 + .../pqclean_mceliece348864_avx/benes.h | 15 + .../pqclean_mceliece348864_avx/bm.c | 219 + .../pqclean_mceliece348864_avx/bm.h | 14 + .../pqclean_mceliece348864_avx/consts.S | 33 + .../pqclean_mceliece348864_avx/consts.inc | 238 + .../pqclean_mceliece348864_avx/controlbits.c | 274 + .../pqclean_mceliece348864_avx/controlbits.h | 15 + .../pqclean_mceliece348864_avx/crypto_hash.h | 7 + .../pqclean_mceliece348864_avx/decrypt.c | 234 + .../pqclean_mceliece348864_avx/decrypt.h | 10 + .../pqclean_mceliece348864_avx/encrypt.c | 99 + .../pqclean_mceliece348864_avx/encrypt.h | 11 + .../pqclean_mceliece348864_avx/fft.c | 172 + .../pqclean_mceliece348864_avx/fft.h | 18 + .../pqclean_mceliece348864_avx/fft_tr.c | 355 + .../pqclean_mceliece348864_avx/fft_tr.h | 14 + .../pqclean_mceliece348864_avx/gf.c | 169 + .../pqclean_mceliece348864_avx/gf.h | 26 + .../pqclean_mceliece348864_avx/int32_sort.c | 1211 +++ .../pqclean_mceliece348864_avx/int32_sort.h | 9 + .../pqclean_mceliece348864_avx/operations.c | 136 + .../pqclean_mceliece348864_avx/params.h | 21 + .../pqclean_mceliece348864_avx/pk_gen.c | 276 + .../pqclean_mceliece348864_avx/pk_gen.h | 13 + .../pqclean_mceliece348864_avx/powers.inc | 224 + .../pqclean_mceliece348864_avx/scalars.inc | 70 + .../pqclean_mceliece348864_avx/scalars_2x.inc | 70 + .../pqclean_mceliece348864_avx/sk_gen.c | 98 + .../pqclean_mceliece348864_avx/sk_gen.h | 16 + .../pqclean_mceliece348864_avx/syndrome_asm.S | 530 ++ .../pqclean_mceliece348864_avx/transpose.c | 17 + .../pqclean_mceliece348864_avx/transpose.h | 17 + .../transpose_64x256_sp_asm.S | 8145 ++++++++++++++++ .../transpose_64x64_asm.S | 8467 +++++++++++++++++ .../pqclean_mceliece348864_avx/uint32_sort.c | 18 + .../pqclean_mceliece348864_avx/uint32_sort.h | 9 + .../pqclean_mceliece348864_avx/update_asm.S | 354 + .../pqclean_mceliece348864_avx/util.c | 106 + .../pqclean_mceliece348864_avx/util.h | 33 + .../pqclean_mceliece348864_avx/vec.c | 25 + .../pqclean_mceliece348864_avx/vec.h | 13 + .../pqclean_mceliece348864_avx/vec128.c | 83 + .../pqclean_mceliece348864_avx/vec128.h | 41 + .../vec128_mul_asm.S | 1369 +++ .../pqclean_mceliece348864_avx/vec256.c | 137 + .../pqclean_mceliece348864_avx/vec256.h | 45 + .../vec256_mul_asm.S | 1736 ++++ .../pqclean_mceliece348864_avx/vec_mul_asm.S | 1106 +++ .../vec_mul_sp_asm.S | 1115 +++ .../vec_reduce_asm.S | 356 + .../pqclean_mceliece348864_clean/LICENSE | 16 + .../pqclean_mceliece348864_clean/aes256ctr.c | 13 + .../pqclean_mceliece348864_clean/aes256ctr.h | 17 + .../pqclean_mceliece348864_clean/api.h | 32 + .../pqclean_mceliece348864_clean/benes.c | 139 + .../pqclean_mceliece348864_clean/benes.h | 14 + .../pqclean_mceliece348864_clean/bm.c | 83 + .../pqclean_mceliece348864_clean/bm.h | 13 + .../controlbits.c | 274 + .../controlbits.h | 15 + .../crypto_hash.h | 7 + 
.../pqclean_mceliece348864_clean/decrypt.c | 90 + .../pqclean_mceliece348864_clean/decrypt.h | 10 + .../pqclean_mceliece348864_clean/encrypt.c | 138 + .../pqclean_mceliece348864_clean/encrypt.h | 11 + .../pqclean_mceliece348864_clean/gf.c | 139 + .../pqclean_mceliece348864_clean/gf.h | 22 + .../pqclean_mceliece348864_clean/operations.c | 136 + .../pqclean_mceliece348864_clean/params.h | 21 + .../pqclean_mceliece348864_clean/pk_gen.c | 144 + .../pqclean_mceliece348864_clean/pk_gen.h | 13 + .../pqclean_mceliece348864_clean/root.c | 33 + .../pqclean_mceliece348864_clean/root.h | 14 + .../pqclean_mceliece348864_clean/sk_gen.c | 98 + .../pqclean_mceliece348864_clean/sk_gen.h | 16 + .../pqclean_mceliece348864_clean/synd.c | 33 + .../pqclean_mceliece348864_clean/synd.h | 12 + .../pqclean_mceliece348864_clean/transpose.c | 42 + .../pqclean_mceliece348864_clean/transpose.h | 13 + .../pqclean_mceliece348864_clean/util.c | 67 + .../pqclean_mceliece348864_clean/util.h | 22 + .../pqclean_mceliece348864_sse/LICENSE | 16 + .../pqclean_mceliece348864_sse/aes256ctr.c | 13 + .../pqclean_mceliece348864_sse/aes256ctr.h | 17 + .../pqclean_mceliece348864_sse/api.h | 32 + .../pqclean_mceliece348864_sse/benes.c | 287 + .../pqclean_mceliece348864_sse/benes.h | 15 + .../pqclean_mceliece348864_sse/bm.c | 220 + .../pqclean_mceliece348864_sse/bm.h | 17 + .../pqclean_mceliece348864_sse/consts.S | 32 + .../pqclean_mceliece348864_sse/consts.inc | 448 + .../pqclean_mceliece348864_sse/controlbits.c | 274 + .../pqclean_mceliece348864_sse/controlbits.h | 15 + .../pqclean_mceliece348864_sse/crypto_hash.h | 7 + .../pqclean_mceliece348864_sse/decrypt.c | 203 + .../pqclean_mceliece348864_sse/decrypt.h | 10 + .../pqclean_mceliece348864_sse/encrypt.c | 99 + .../pqclean_mceliece348864_sse/encrypt.h | 11 + .../pqclean_mceliece348864_sse/fft.c | 155 + .../pqclean_mceliece348864_sse/fft.h | 17 + .../pqclean_mceliece348864_sse/fft_tr.c | 312 + .../pqclean_mceliece348864_sse/fft_tr.h | 13 + .../pqclean_mceliece348864_sse/gf.c | 169 + .../pqclean_mceliece348864_sse/gf.h | 26 + .../pqclean_mceliece348864_sse/operations.c | 136 + .../pqclean_mceliece348864_sse/params.h | 21 + .../pqclean_mceliece348864_sse/pk_gen.c | 329 + .../pqclean_mceliece348864_sse/pk_gen.h | 13 + .../pqclean_mceliece348864_sse/powers.inc | 448 + .../pqclean_mceliece348864_sse/scalars.inc | 70 + .../pqclean_mceliece348864_sse/scalars_2x.inc | 70 + .../pqclean_mceliece348864_sse/sk_gen.c | 98 + .../pqclean_mceliece348864_sse/sk_gen.h | 16 + .../pqclean_mceliece348864_sse/syndrome_asm.S | 740 ++ .../pqclean_mceliece348864_sse/transpose.c | 12 + .../pqclean_mceliece348864_sse/transpose.h | 16 + .../transpose_64x128_sp_asm.S | 8145 ++++++++++++++++ .../transpose_64x64_asm.S | 8467 +++++++++++++++++ .../pqclean_mceliece348864_sse/update_asm.S | 354 + .../pqclean_mceliece348864_sse/util.c | 106 + .../pqclean_mceliece348864_sse/util.h | 33 + .../pqclean_mceliece348864_sse/vec.c | 17 + .../pqclean_mceliece348864_sse/vec.h | 11 + .../pqclean_mceliece348864_sse/vec128.c | 143 + .../pqclean_mceliece348864_sse/vec128.h | 42 + .../vec128_mul_asm.S | 1736 ++++ .../pqclean_mceliece348864_sse/vec_mul_asm.S | 1515 +++ .../vec_reduce_asm.S | 356 + .../pqclean_mceliece348864f_avx/LICENSE | 16 + .../pqclean_mceliece348864f_avx/aes256ctr.c | 13 + .../pqclean_mceliece348864f_avx/aes256ctr.h | 17 + .../pqclean_mceliece348864f_avx/api.h | 33 + .../pqclean_mceliece348864f_avx/benes.c | 287 + .../pqclean_mceliece348864f_avx/benes.h | 15 + .../pqclean_mceliece348864f_avx/bm.c | 219 + 
.../pqclean_mceliece348864f_avx/bm.h | 14 + .../pqclean_mceliece348864f_avx/consts.S | 33 + .../pqclean_mceliece348864f_avx/consts.inc | 238 + .../pqclean_mceliece348864f_avx/controlbits.c | 274 + .../pqclean_mceliece348864f_avx/controlbits.h | 15 + .../pqclean_mceliece348864f_avx/crypto_hash.h | 7 + .../pqclean_mceliece348864f_avx/decrypt.c | 234 + .../pqclean_mceliece348864f_avx/decrypt.h | 10 + .../pqclean_mceliece348864f_avx/encrypt.c | 99 + .../pqclean_mceliece348864f_avx/encrypt.h | 11 + .../pqclean_mceliece348864f_avx/fft.c | 172 + .../pqclean_mceliece348864f_avx/fft.h | 18 + .../pqclean_mceliece348864f_avx/fft_tr.c | 355 + .../pqclean_mceliece348864f_avx/fft_tr.h | 14 + .../pqclean_mceliece348864f_avx/gf.c | 169 + .../pqclean_mceliece348864f_avx/gf.h | 26 + .../pqclean_mceliece348864f_avx/int32_sort.c | 1211 +++ .../pqclean_mceliece348864f_avx/int32_sort.h | 9 + .../pqclean_mceliece348864f_avx/operations.c | 136 + .../pqclean_mceliece348864f_avx/params.h | 21 + .../pqclean_mceliece348864f_avx/pk_gen.c | 329 + .../pqclean_mceliece348864f_avx/pk_gen.h | 13 + .../pqclean_mceliece348864f_avx/powers.inc | 224 + .../pqclean_mceliece348864f_avx/scalars.inc | 70 + .../scalars_2x.inc | 70 + .../pqclean_mceliece348864f_avx/sk_gen.c | 98 + .../pqclean_mceliece348864f_avx/sk_gen.h | 16 + .../syndrome_asm.S | 530 ++ .../pqclean_mceliece348864f_avx/transpose.c | 17 + .../pqclean_mceliece348864f_avx/transpose.h | 17 + .../transpose_64x256_sp_asm.S | 8145 ++++++++++++++++ .../transpose_64x64_asm.S | 8467 +++++++++++++++++ .../pqclean_mceliece348864f_avx/uint32_sort.c | 18 + .../pqclean_mceliece348864f_avx/uint32_sort.h | 9 + .../pqclean_mceliece348864f_avx/update_asm.S | 354 + .../pqclean_mceliece348864f_avx/util.c | 106 + .../pqclean_mceliece348864f_avx/util.h | 33 + .../pqclean_mceliece348864f_avx/vec.c | 25 + .../pqclean_mceliece348864f_avx/vec.h | 13 + .../pqclean_mceliece348864f_avx/vec128.c | 83 + .../pqclean_mceliece348864f_avx/vec128.h | 41 + .../vec128_mul_asm.S | 1369 +++ .../pqclean_mceliece348864f_avx/vec256.c | 137 + .../pqclean_mceliece348864f_avx/vec256.h | 45 + .../vec256_mul_asm.S | 1736 ++++ .../pqclean_mceliece348864f_avx/vec_mul_asm.S | 1106 +++ .../vec_mul_sp_asm.S | 1115 +++ .../vec_reduce_asm.S | 356 + .../pqclean_mceliece460896_avx/LICENSE | 16 + .../pqclean_mceliece460896_avx/aes256ctr.c | 13 + .../pqclean_mceliece460896_avx/aes256ctr.h | 17 + .../pqclean_mceliece460896_avx/api.h | 31 + .../pqclean_mceliece460896_avx/benes.c | 311 + .../pqclean_mceliece460896_avx/benes.h | 14 + .../pqclean_mceliece460896_avx/bm.c | 215 + .../pqclean_mceliece460896_avx/bm.h | 14 + .../pqclean_mceliece460896_avx/consts.S | 33 + .../pqclean_mceliece460896_avx/consts.inc | 502 + .../pqclean_mceliece460896_avx/controlbits.c | 274 + .../pqclean_mceliece460896_avx/controlbits.h | 15 + .../pqclean_mceliece460896_avx/crypto_hash.h | 7 + .../pqclean_mceliece460896_avx/decrypt.c | 234 + .../pqclean_mceliece460896_avx/decrypt.h | 10 + .../pqclean_mceliece460896_avx/encrypt.c | 99 + .../pqclean_mceliece460896_avx/encrypt.h | 11 + .../pqclean_mceliece460896_avx/fft.c | 262 + .../pqclean_mceliece460896_avx/fft.h | 17 + .../pqclean_mceliece460896_avx/fft_tr.c | 398 + .../pqclean_mceliece460896_avx/fft_tr.h | 14 + .../pqclean_mceliece460896_avx/gf.c | 205 + .../pqclean_mceliece460896_avx/gf.h | 22 + .../pqclean_mceliece460896_avx/int32_sort.c | 1211 +++ .../pqclean_mceliece460896_avx/int32_sort.h | 9 + .../pqclean_mceliece460896_avx/operations.c | 136 + .../pqclean_mceliece460896_avx/params.h | 21 + 
.../pqclean_mceliece460896_avx/pk_gen.c | 290 + .../pqclean_mceliece460896_avx/pk_gen.h | 12 + .../pqclean_mceliece460896_avx/scalars_2x.inc | 75 + .../pqclean_mceliece460896_avx/scalars_4x.inc | 91 + .../pqclean_mceliece460896_avx/sk_gen.c | 98 + .../pqclean_mceliece460896_avx/sk_gen.h | 16 + .../pqclean_mceliece460896_avx/syndrome_asm.S | 650 ++ .../pqclean_mceliece460896_avx/transpose.c | 18 + .../pqclean_mceliece460896_avx/transpose.h | 16 + .../transpose_64x128_sp_asm.S | 8145 ++++++++++++++++ .../transpose_64x256_sp_asm.S | 8145 ++++++++++++++++ .../pqclean_mceliece460896_avx/uint32_sort.c | 18 + .../pqclean_mceliece460896_avx/uint32_sort.h | 9 + .../pqclean_mceliece460896_avx/update_asm.S | 576 ++ .../pqclean_mceliece460896_avx/util.c | 106 + .../pqclean_mceliece460896_avx/util.h | 23 + .../pqclean_mceliece460896_avx/vec128.c | 83 + .../pqclean_mceliece460896_avx/vec128.h | 41 + .../vec128_mul_asm.S | 1816 ++++ .../pqclean_mceliece460896_avx/vec256.c | 146 + .../pqclean_mceliece460896_avx/vec256.h | 45 + .../vec256_ama_asm.S | 2322 +++++ .../vec256_maa_asm.S | 2322 +++++ .../vec256_mul_asm.S | 2127 +++++ .../vec_reduce_asm.S | 513 + .../pqclean_mceliece460896f_avx/LICENSE | 16 + .../pqclean_mceliece460896f_avx/aes256ctr.c | 13 + .../pqclean_mceliece460896f_avx/aes256ctr.h | 17 + .../pqclean_mceliece460896f_avx/api.h | 31 + .../pqclean_mceliece460896f_avx/benes.c | 311 + .../pqclean_mceliece460896f_avx/benes.h | 14 + .../pqclean_mceliece460896f_avx/bm.c | 215 + .../pqclean_mceliece460896f_avx/bm.h | 14 + .../pqclean_mceliece460896f_avx/consts.S | 33 + .../pqclean_mceliece460896f_avx/consts.inc | 502 + .../pqclean_mceliece460896f_avx/controlbits.c | 274 + .../pqclean_mceliece460896f_avx/controlbits.h | 15 + .../pqclean_mceliece460896f_avx/crypto_hash.h | 7 + .../pqclean_mceliece460896f_avx/decrypt.c | 234 + .../pqclean_mceliece460896f_avx/decrypt.h | 10 + .../pqclean_mceliece460896f_avx/encrypt.c | 99 + .../pqclean_mceliece460896f_avx/encrypt.h | 11 + .../pqclean_mceliece460896f_avx/fft.c | 262 + .../pqclean_mceliece460896f_avx/fft.h | 17 + .../pqclean_mceliece460896f_avx/fft_tr.c | 398 + .../pqclean_mceliece460896f_avx/fft_tr.h | 14 + .../pqclean_mceliece460896f_avx/gf.c | 205 + .../pqclean_mceliece460896f_avx/gf.h | 22 + .../pqclean_mceliece460896f_avx/int32_sort.c | 1211 +++ .../pqclean_mceliece460896f_avx/int32_sort.h | 9 + .../pqclean_mceliece460896f_avx/operations.c | 136 + .../pqclean_mceliece460896f_avx/params.h | 21 + .../pqclean_mceliece460896f_avx/pk_gen.c | 358 + .../pqclean_mceliece460896f_avx/pk_gen.h | 12 + .../scalars_2x.inc | 75 + .../scalars_4x.inc | 91 + .../pqclean_mceliece460896f_avx/sk_gen.c | 98 + .../pqclean_mceliece460896f_avx/sk_gen.h | 16 + .../syndrome_asm.S | 650 ++ .../pqclean_mceliece460896f_avx/transpose.c | 18 + .../pqclean_mceliece460896f_avx/transpose.h | 16 + .../transpose_64x128_sp_asm.S | 8145 ++++++++++++++++ .../transpose_64x256_sp_asm.S | 8145 ++++++++++++++++ .../pqclean_mceliece460896f_avx/uint32_sort.c | 18 + .../pqclean_mceliece460896f_avx/uint32_sort.h | 9 + .../pqclean_mceliece460896f_avx/update_asm.S | 576 ++ .../pqclean_mceliece460896f_avx/util.c | 106 + .../pqclean_mceliece460896f_avx/util.h | 23 + .../pqclean_mceliece460896f_avx/vec128.c | 79 + .../pqclean_mceliece460896f_avx/vec128.h | 41 + .../vec128_mul_asm.S | 1816 ++++ .../pqclean_mceliece460896f_avx/vec256.c | 146 + .../pqclean_mceliece460896f_avx/vec256.h | 45 + .../vec256_ama_asm.S | 2322 +++++ .../vec256_maa_asm.S | 2322 +++++ .../vec256_mul_asm.S | 2127 +++++ 
.../vec_reduce_asm.S | 513 + .../pqclean_mceliece6688128_avx/LICENSE | 16 + .../pqclean_mceliece6688128_avx/aes256ctr.c | 13 + .../pqclean_mceliece6688128_avx/aes256ctr.h | 17 + .../pqclean_mceliece6688128_avx/api.h | 32 + .../pqclean_mceliece6688128_avx/benes.c | 311 + .../pqclean_mceliece6688128_avx/benes.h | 14 + .../pqclean_mceliece6688128_avx/bm.c | 219 + .../pqclean_mceliece6688128_avx/bm.h | 14 + .../pqclean_mceliece6688128_avx/consts.S | 33 + .../pqclean_mceliece6688128_avx/consts.inc | 502 + .../pqclean_mceliece6688128_avx/controlbits.c | 274 + .../pqclean_mceliece6688128_avx/controlbits.h | 15 + .../pqclean_mceliece6688128_avx/crypto_hash.h | 7 + .../pqclean_mceliece6688128_avx/decrypt.c | 234 + .../pqclean_mceliece6688128_avx/decrypt.h | 10 + .../pqclean_mceliece6688128_avx/encrypt.c | 104 + .../pqclean_mceliece6688128_avx/encrypt.h | 11 + .../pqclean_mceliece6688128_avx/fft.c | 275 + .../pqclean_mceliece6688128_avx/fft.h | 17 + .../pqclean_mceliece6688128_avx/fft_tr.c | 379 + .../pqclean_mceliece6688128_avx/fft_tr.h | 14 + .../pqclean_mceliece6688128_avx/gf.c | 236 + .../pqclean_mceliece6688128_avx/gf.h | 21 + .../pqclean_mceliece6688128_avx/int32_sort.c | 1211 +++ .../pqclean_mceliece6688128_avx/int32_sort.h | 9 + .../pqclean_mceliece6688128_avx/operations.c | 136 + .../pqclean_mceliece6688128_avx/params.h | 21 + .../pqclean_mceliece6688128_avx/pk_gen.c | 286 + .../pqclean_mceliece6688128_avx/pk_gen.h | 12 + .../pqclean_mceliece6688128_avx/powers.inc | 480 + .../scalars_2x.inc | 75 + .../scalars_4x.inc | 91 + .../pqclean_mceliece6688128_avx/sk_gen.c | 98 + .../pqclean_mceliece6688128_avx/sk_gen.h | 16 + .../syndrome_asm.S | 810 ++ .../pqclean_mceliece6688128_avx/transpose.c | 18 + .../pqclean_mceliece6688128_avx/transpose.h | 16 + .../transpose_64x128_sp_asm.S | 8145 ++++++++++++++++ .../transpose_64x256_sp_asm.S | 8145 ++++++++++++++++ .../pqclean_mceliece6688128_avx/uint32_sort.c | 18 + .../pqclean_mceliece6688128_avx/uint32_sort.h | 9 + .../pqclean_mceliece6688128_avx/update_asm.S | 576 ++ .../pqclean_mceliece6688128_avx/util.c | 106 + .../pqclean_mceliece6688128_avx/util.h | 23 + .../pqclean_mceliece6688128_avx/vec128.c | 79 + .../pqclean_mceliece6688128_avx/vec128.h | 41 + .../vec128_mul_asm.S | 1816 ++++ .../pqclean_mceliece6688128_avx/vec256.c | 146 + .../pqclean_mceliece6688128_avx/vec256.h | 45 + .../vec256_ama_asm.S | 2322 +++++ .../vec256_maa_asm.S | 2322 +++++ .../vec256_mul_asm.S | 2127 +++++ .../vec_reduce_asm.S | 513 + .../pqclean_mceliece6688128f_avx/LICENSE | 16 + .../pqclean_mceliece6688128f_avx/aes256ctr.c | 13 + .../pqclean_mceliece6688128f_avx/aes256ctr.h | 17 + .../pqclean_mceliece6688128f_avx/api.h | 32 + .../pqclean_mceliece6688128f_avx/benes.c | 311 + .../pqclean_mceliece6688128f_avx/benes.h | 14 + .../pqclean_mceliece6688128f_avx/bm.c | 219 + .../pqclean_mceliece6688128f_avx/bm.h | 14 + .../pqclean_mceliece6688128f_avx/consts.S | 33 + .../pqclean_mceliece6688128f_avx/consts.inc | 502 + .../controlbits.c | 274 + .../controlbits.h | 15 + .../crypto_hash.h | 7 + .../pqclean_mceliece6688128f_avx/decrypt.c | 234 + .../pqclean_mceliece6688128f_avx/decrypt.h | 10 + .../pqclean_mceliece6688128f_avx/encrypt.c | 104 + .../pqclean_mceliece6688128f_avx/encrypt.h | 11 + .../pqclean_mceliece6688128f_avx/fft.c | 275 + .../pqclean_mceliece6688128f_avx/fft.h | 17 + .../pqclean_mceliece6688128f_avx/fft_tr.c | 379 + .../pqclean_mceliece6688128f_avx/fft_tr.h | 14 + .../pqclean_mceliece6688128f_avx/gf.c | 236 + .../pqclean_mceliece6688128f_avx/gf.h | 21 + 
.../pqclean_mceliece6688128f_avx/int32_sort.c | 1211 +++ .../pqclean_mceliece6688128f_avx/int32_sort.h | 9 + .../pqclean_mceliece6688128f_avx/operations.c | 136 + .../pqclean_mceliece6688128f_avx/params.h | 21 + .../pqclean_mceliece6688128f_avx/pk_gen.c | 360 + .../pqclean_mceliece6688128f_avx/pk_gen.h | 12 + .../pqclean_mceliece6688128f_avx/powers.inc | 480 + .../scalars_2x.inc | 75 + .../scalars_4x.inc | 91 + .../pqclean_mceliece6688128f_avx/sk_gen.c | 98 + .../pqclean_mceliece6688128f_avx/sk_gen.h | 16 + .../syndrome_asm.S | 810 ++ .../pqclean_mceliece6688128f_avx/transpose.c | 18 + .../pqclean_mceliece6688128f_avx/transpose.h | 16 + .../transpose_64x128_sp_asm.S | 8145 ++++++++++++++++ .../transpose_64x256_sp_asm.S | 8145 ++++++++++++++++ .../uint32_sort.c | 18 + .../uint32_sort.h | 9 + .../pqclean_mceliece6688128f_avx/update_asm.S | 576 ++ .../pqclean_mceliece6688128f_avx/util.c | 106 + .../pqclean_mceliece6688128f_avx/util.h | 23 + .../pqclean_mceliece6688128f_avx/vec128.c | 79 + .../pqclean_mceliece6688128f_avx/vec128.h | 41 + .../vec128_mul_asm.S | 1816 ++++ .../pqclean_mceliece6688128f_avx/vec256.c | 146 + .../pqclean_mceliece6688128f_avx/vec256.h | 45 + .../vec256_ama_asm.S | 2322 +++++ .../vec256_maa_asm.S | 2322 +++++ .../vec256_mul_asm.S | 2127 +++++ .../vec_reduce_asm.S | 513 + .../pqclean_mceliece6960119_avx/LICENSE | 16 + .../pqclean_mceliece6960119_avx/aes256ctr.c | 13 + .../pqclean_mceliece6960119_avx/aes256ctr.h | 17 + .../pqclean_mceliece6960119_avx/api.h | 32 + .../pqclean_mceliece6960119_avx/benes.c | 311 + .../pqclean_mceliece6960119_avx/benes.h | 14 + .../pqclean_mceliece6960119_avx/bm.c | 215 + .../pqclean_mceliece6960119_avx/bm.h | 14 + .../pqclean_mceliece6960119_avx/consts.S | 33 + .../pqclean_mceliece6960119_avx/consts.inc | 502 + .../pqclean_mceliece6960119_avx/controlbits.c | 274 + .../pqclean_mceliece6960119_avx/controlbits.h | 15 + .../pqclean_mceliece6960119_avx/crypto_hash.h | 7 + .../pqclean_mceliece6960119_avx/decrypt.c | 236 + .../pqclean_mceliece6960119_avx/decrypt.h | 10 + .../pqclean_mceliece6960119_avx/encrypt.c | 104 + .../pqclean_mceliece6960119_avx/encrypt.h | 11 + .../pqclean_mceliece6960119_avx/fft.c | 262 + .../pqclean_mceliece6960119_avx/fft.h | 17 + .../pqclean_mceliece6960119_avx/fft_tr.c | 400 + .../pqclean_mceliece6960119_avx/fft_tr.h | 14 + .../pqclean_mceliece6960119_avx/gf.c | 203 + .../pqclean_mceliece6960119_avx/gf.h | 22 + .../pqclean_mceliece6960119_avx/int32_sort.c | 1211 +++ .../pqclean_mceliece6960119_avx/int32_sort.h | 9 + .../pqclean_mceliece6960119_avx/operations.c | 136 + .../pqclean_mceliece6960119_avx/params.h | 21 + .../pqclean_mceliece6960119_avx/pk_gen.c | 292 + .../pqclean_mceliece6960119_avx/pk_gen.h | 12 + .../scalars_2x.inc | 75 + .../scalars_4x.inc | 91 + .../pqclean_mceliece6960119_avx/sk_gen.c | 98 + .../pqclean_mceliece6960119_avx/sk_gen.h | 16 + .../syndrome_asm.S | 921 ++ .../pqclean_mceliece6960119_avx/transpose.c | 18 + .../pqclean_mceliece6960119_avx/transpose.h | 16 + .../transpose_64x128_sp_asm.S | 8145 ++++++++++++++++ .../transpose_64x256_sp_asm.S | 8145 ++++++++++++++++ .../pqclean_mceliece6960119_avx/uint32_sort.c | 18 + .../pqclean_mceliece6960119_avx/uint32_sort.h | 9 + .../pqclean_mceliece6960119_avx/update_asm.S | 576 ++ .../pqclean_mceliece6960119_avx/util.c | 106 + .../pqclean_mceliece6960119_avx/util.h | 23 + .../pqclean_mceliece6960119_avx/vec128.c | 79 + .../pqclean_mceliece6960119_avx/vec128.h | 41 + .../vec128_mul_asm.S | 1816 ++++ .../pqclean_mceliece6960119_avx/vec256.c | 146 + 
.../pqclean_mceliece6960119_avx/vec256.h | 45 + .../vec256_ama_asm.S | 2322 +++++ .../vec256_maa_asm.S | 2322 +++++ .../vec256_mul_asm.S | 2127 +++++ .../vec_reduce_asm.S | 513 + .../pqclean_mceliece6960119f_avx/LICENSE | 16 + .../pqclean_mceliece6960119f_avx/aes256ctr.c | 13 + .../pqclean_mceliece6960119f_avx/aes256ctr.h | 17 + .../pqclean_mceliece6960119f_avx/api.h | 31 + .../pqclean_mceliece6960119f_avx/benes.c | 311 + .../pqclean_mceliece6960119f_avx/benes.h | 14 + .../pqclean_mceliece6960119f_avx/bm.c | 215 + .../pqclean_mceliece6960119f_avx/bm.h | 14 + .../pqclean_mceliece6960119f_avx/consts.S | 33 + .../pqclean_mceliece6960119f_avx/consts.inc | 502 + .../controlbits.c | 274 + .../controlbits.h | 15 + .../crypto_hash.h | 7 + .../pqclean_mceliece6960119f_avx/decrypt.c | 236 + .../pqclean_mceliece6960119f_avx/decrypt.h | 10 + .../pqclean_mceliece6960119f_avx/encrypt.c | 104 + .../pqclean_mceliece6960119f_avx/encrypt.h | 11 + .../pqclean_mceliece6960119f_avx/fft.c | 262 + .../pqclean_mceliece6960119f_avx/fft.h | 17 + .../pqclean_mceliece6960119f_avx/fft_tr.c | 400 + .../pqclean_mceliece6960119f_avx/fft_tr.h | 14 + .../pqclean_mceliece6960119f_avx/gf.c | 203 + .../pqclean_mceliece6960119f_avx/gf.h | 22 + .../pqclean_mceliece6960119f_avx/int32_sort.c | 1211 +++ .../pqclean_mceliece6960119f_avx/int32_sort.h | 9 + .../pqclean_mceliece6960119f_avx/operations.c | 136 + .../pqclean_mceliece6960119f_avx/params.h | 21 + .../pqclean_mceliece6960119f_avx/pk_gen.c | 372 + .../pqclean_mceliece6960119f_avx/pk_gen.h | 12 + .../scalars_2x.inc | 75 + .../scalars_4x.inc | 91 + .../pqclean_mceliece6960119f_avx/sk_gen.c | 98 + .../pqclean_mceliece6960119f_avx/sk_gen.h | 16 + .../syndrome_asm.S | 921 ++ .../pqclean_mceliece6960119f_avx/transpose.c | 18 + .../pqclean_mceliece6960119f_avx/transpose.h | 16 + .../transpose_64x128_sp_asm.S | 8145 ++++++++++++++++ .../transpose_64x256_sp_asm.S | 8145 ++++++++++++++++ .../uint32_sort.c | 18 + .../uint32_sort.h | 9 + .../pqclean_mceliece6960119f_avx/update_asm.S | 576 ++ .../pqclean_mceliece6960119f_avx/util.c | 106 + .../pqclean_mceliece6960119f_avx/util.h | 23 + .../pqclean_mceliece6960119f_avx/vec128.c | 79 + .../pqclean_mceliece6960119f_avx/vec128.h | 41 + .../vec128_mul_asm.S | 1816 ++++ .../pqclean_mceliece6960119f_avx/vec256.c | 146 + .../pqclean_mceliece6960119f_avx/vec256.h | 45 + .../vec256_ama_asm.S | 2322 +++++ .../vec256_maa_asm.S | 2322 +++++ .../vec256_mul_asm.S | 2127 +++++ .../vec_reduce_asm.S | 513 + .../pqclean_mceliece8192128_avx/LICENSE | 16 + .../pqclean_mceliece8192128_avx/aes256ctr.c | 13 + .../pqclean_mceliece8192128_avx/aes256ctr.h | 17 + .../pqclean_mceliece8192128_avx/api.h | 32 + .../pqclean_mceliece8192128_avx/benes.c | 311 + .../pqclean_mceliece8192128_avx/benes.h | 14 + .../pqclean_mceliece8192128_avx/bm.c | 219 + .../pqclean_mceliece8192128_avx/bm.h | 14 + .../pqclean_mceliece8192128_avx/consts.S | 33 + .../pqclean_mceliece8192128_avx/consts.inc | 502 + .../pqclean_mceliece8192128_avx/controlbits.c | 274 + .../pqclean_mceliece8192128_avx/controlbits.h | 15 + .../pqclean_mceliece8192128_avx/crypto_hash.h | 7 + .../pqclean_mceliece8192128_avx/decrypt.c | 207 + .../pqclean_mceliece8192128_avx/decrypt.h | 10 + .../pqclean_mceliece8192128_avx/encrypt.c | 80 + .../pqclean_mceliece8192128_avx/encrypt.h | 11 + .../pqclean_mceliece8192128_avx/fft.c | 275 + .../pqclean_mceliece8192128_avx/fft.h | 17 + .../pqclean_mceliece8192128_avx/fft_tr.c | 379 + .../pqclean_mceliece8192128_avx/fft_tr.h | 14 + .../pqclean_mceliece8192128_avx/gf.c | 236 + 
.../pqclean_mceliece8192128_avx/gf.h | 25 + .../pqclean_mceliece8192128_avx/int32_sort.c | 1211 +++ .../pqclean_mceliece8192128_avx/int32_sort.h | 9 + .../pqclean_mceliece8192128_avx/operations.c | 136 + .../pqclean_mceliece8192128_avx/params.h | 21 + .../pqclean_mceliece8192128_avx/pk_gen.c | 288 + .../pqclean_mceliece8192128_avx/pk_gen.h | 12 + .../pqclean_mceliece8192128_avx/powers.inc | 480 + .../scalars_2x.inc | 75 + .../scalars_4x.inc | 91 + .../pqclean_mceliece8192128_avx/sk_gen.c | 98 + .../pqclean_mceliece8192128_avx/sk_gen.h | 16 + .../syndrome_asm.S | 910 ++ .../pqclean_mceliece8192128_avx/transpose.c | 18 + .../pqclean_mceliece8192128_avx/transpose.h | 15 + .../transpose_64x128_sp_asm.S | 8145 ++++++++++++++++ .../transpose_64x256_sp_asm.S | 8145 ++++++++++++++++ .../pqclean_mceliece8192128_avx/uint32_sort.c | 18 + .../pqclean_mceliece8192128_avx/uint32_sort.h | 9 + .../pqclean_mceliece8192128_avx/update_asm.S | 576 ++ .../pqclean_mceliece8192128_avx/util.c | 97 + .../pqclean_mceliece8192128_avx/util.h | 23 + .../pqclean_mceliece8192128_avx/vec128.c | 79 + .../pqclean_mceliece8192128_avx/vec128.h | 41 + .../vec128_mul_asm.S | 1816 ++++ .../pqclean_mceliece8192128_avx/vec256.c | 146 + .../pqclean_mceliece8192128_avx/vec256.h | 45 + .../vec256_ama_asm.S | 2322 +++++ .../vec256_maa_asm.S | 2322 +++++ .../vec256_mul_asm.S | 2127 +++++ .../vec_reduce_asm.S | 513 + .../pqclean_mceliece8192128f_avx/LICENSE | 16 + .../pqclean_mceliece8192128f_avx/aes256ctr.c | 13 + .../pqclean_mceliece8192128f_avx/aes256ctr.h | 17 + .../pqclean_mceliece8192128f_avx/api.h | 32 + .../pqclean_mceliece8192128f_avx/benes.c | 311 + .../pqclean_mceliece8192128f_avx/benes.h | 14 + .../pqclean_mceliece8192128f_avx/bm.c | 219 + .../pqclean_mceliece8192128f_avx/bm.h | 14 + .../pqclean_mceliece8192128f_avx/consts.S | 33 + .../pqclean_mceliece8192128f_avx/consts.inc | 502 + .../controlbits.c | 274 + .../controlbits.h | 15 + .../crypto_hash.h | 7 + .../pqclean_mceliece8192128f_avx/decrypt.c | 207 + .../pqclean_mceliece8192128f_avx/decrypt.h | 10 + .../pqclean_mceliece8192128f_avx/encrypt.c | 80 + .../pqclean_mceliece8192128f_avx/encrypt.h | 11 + .../pqclean_mceliece8192128f_avx/fft.c | 275 + .../pqclean_mceliece8192128f_avx/fft.h | 17 + .../pqclean_mceliece8192128f_avx/fft_tr.c | 379 + .../pqclean_mceliece8192128f_avx/fft_tr.h | 14 + .../pqclean_mceliece8192128f_avx/gf.c | 236 + .../pqclean_mceliece8192128f_avx/gf.h | 25 + .../pqclean_mceliece8192128f_avx/int32_sort.c | 1211 +++ .../pqclean_mceliece8192128f_avx/int32_sort.h | 9 + .../pqclean_mceliece8192128f_avx/operations.c | 136 + .../pqclean_mceliece8192128f_avx/params.h | 21 + .../pqclean_mceliece8192128f_avx/pk_gen.c | 355 + .../pqclean_mceliece8192128f_avx/pk_gen.h | 12 + .../pqclean_mceliece8192128f_avx/powers.inc | 480 + .../scalars_2x.inc | 75 + .../scalars_4x.inc | 91 + .../pqclean_mceliece8192128f_avx/sk_gen.c | 98 + .../pqclean_mceliece8192128f_avx/sk_gen.h | 16 + .../syndrome_asm.S | 910 ++ .../pqclean_mceliece8192128f_avx/transpose.c | 18 + .../pqclean_mceliece8192128f_avx/transpose.h | 15 + .../transpose_64x128_sp_asm.S | 8145 ++++++++++++++++ .../transpose_64x256_sp_asm.S | 8145 ++++++++++++++++ .../uint32_sort.c | 18 + .../uint32_sort.h | 9 + .../pqclean_mceliece8192128f_avx/update_asm.S | 576 ++ .../pqclean_mceliece8192128f_avx/util.c | 97 + .../pqclean_mceliece8192128f_avx/util.h | 23 + .../pqclean_mceliece8192128f_avx/vec128.c | 79 + .../pqclean_mceliece8192128f_avx/vec128.h | 41 + .../vec128_mul_asm.S | 1816 ++++ 
.../pqclean_mceliece8192128f_avx/vec256.c | 146 + .../pqclean_mceliece8192128f_avx/vec256.h | 45 + .../vec256_ama_asm.S | 2322 +++++ .../vec256_maa_asm.S | 2322 +++++ .../vec256_mul_asm.S | 2127 +++++ .../vec_reduce_asm.S | 513 + src/oqsconfig.h.cmake | 10 + .../falcon/pqclean_falcon-1024_avx2/pqclean.c | 2 +- .../pqclean_falcon-1024_clean/pqclean.c | 2 +- src/sig/falcon/sig_falcon_1024.c | 2 +- src/sig/falcon/sig_falcon_512.c | 2 +- tests/KATs/sig/kats.json | 2 +- 627 files changed, 342041 insertions(+), 6 deletions(-) create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_avx/LICENSE create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_avx/aes256ctr.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_avx/aes256ctr.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_avx/api.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_avx/benes.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_avx/benes.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_avx/bm.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_avx/bm.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_avx/consts.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_avx/consts.inc create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_avx/controlbits.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_avx/controlbits.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_avx/crypto_hash.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_avx/decrypt.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_avx/decrypt.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_avx/encrypt.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_avx/encrypt.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_avx/fft.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_avx/fft.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_avx/fft_tr.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_avx/fft_tr.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_avx/gf.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_avx/gf.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_avx/int32_sort.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_avx/int32_sort.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_avx/operations.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_avx/params.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_avx/pk_gen.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_avx/pk_gen.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_avx/powers.inc create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_avx/scalars.inc create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_avx/scalars_2x.inc create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_avx/sk_gen.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_avx/sk_gen.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_avx/syndrome_asm.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_avx/transpose.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_avx/transpose.h create mode 100644 
src/kem/classic_mceliece/pqclean_mceliece348864_avx/transpose_64x256_sp_asm.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_avx/transpose_64x64_asm.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_avx/uint32_sort.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_avx/uint32_sort.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_avx/update_asm.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_avx/util.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_avx/util.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_avx/vec.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_avx/vec.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_avx/vec128.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_avx/vec128.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_avx/vec128_mul_asm.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_avx/vec256.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_avx/vec256.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_avx/vec256_mul_asm.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_avx/vec_mul_asm.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_avx/vec_mul_sp_asm.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_avx/vec_reduce_asm.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_clean/LICENSE create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_clean/aes256ctr.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_clean/aes256ctr.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_clean/api.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_clean/benes.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_clean/benes.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_clean/bm.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_clean/bm.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_clean/controlbits.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_clean/controlbits.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_clean/crypto_hash.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_clean/decrypt.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_clean/decrypt.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_clean/encrypt.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_clean/encrypt.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_clean/gf.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_clean/gf.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_clean/operations.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_clean/params.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_clean/pk_gen.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_clean/pk_gen.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_clean/root.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_clean/root.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_clean/sk_gen.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_clean/sk_gen.h create 
mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_clean/synd.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_clean/synd.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_clean/transpose.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_clean/transpose.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_clean/util.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_clean/util.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_sse/LICENSE create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_sse/aes256ctr.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_sse/aes256ctr.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_sse/api.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_sse/benes.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_sse/benes.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_sse/bm.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_sse/bm.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_sse/consts.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_sse/consts.inc create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_sse/controlbits.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_sse/controlbits.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_sse/crypto_hash.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_sse/decrypt.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_sse/decrypt.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_sse/encrypt.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_sse/encrypt.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_sse/fft.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_sse/fft.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_sse/fft_tr.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_sse/fft_tr.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_sse/gf.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_sse/gf.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_sse/operations.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_sse/params.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_sse/pk_gen.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_sse/pk_gen.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_sse/powers.inc create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_sse/scalars.inc create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_sse/scalars_2x.inc create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_sse/sk_gen.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_sse/sk_gen.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_sse/syndrome_asm.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_sse/transpose.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_sse/transpose.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_sse/transpose_64x128_sp_asm.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_sse/transpose_64x64_asm.S create mode 100644 
src/kem/classic_mceliece/pqclean_mceliece348864_sse/update_asm.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_sse/util.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_sse/util.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_sse/vec.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_sse/vec.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_sse/vec128.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_sse/vec128.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_sse/vec128_mul_asm.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_sse/vec_mul_asm.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864_sse/vec_reduce_asm.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864f_avx/LICENSE create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864f_avx/aes256ctr.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864f_avx/aes256ctr.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864f_avx/api.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864f_avx/benes.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864f_avx/benes.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864f_avx/bm.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864f_avx/bm.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864f_avx/consts.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864f_avx/consts.inc create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864f_avx/controlbits.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864f_avx/controlbits.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864f_avx/crypto_hash.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864f_avx/decrypt.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864f_avx/decrypt.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864f_avx/encrypt.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864f_avx/encrypt.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864f_avx/fft.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864f_avx/fft.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864f_avx/fft_tr.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864f_avx/fft_tr.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864f_avx/gf.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864f_avx/gf.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864f_avx/int32_sort.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864f_avx/int32_sort.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864f_avx/operations.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864f_avx/params.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864f_avx/pk_gen.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864f_avx/pk_gen.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864f_avx/powers.inc create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864f_avx/scalars.inc create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864f_avx/scalars_2x.inc create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864f_avx/sk_gen.c create mode 100644 
src/kem/classic_mceliece/pqclean_mceliece348864f_avx/sk_gen.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864f_avx/syndrome_asm.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864f_avx/transpose.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864f_avx/transpose.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864f_avx/transpose_64x256_sp_asm.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864f_avx/transpose_64x64_asm.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864f_avx/uint32_sort.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864f_avx/uint32_sort.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864f_avx/update_asm.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864f_avx/util.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864f_avx/util.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864f_avx/vec.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864f_avx/vec.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864f_avx/vec128.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864f_avx/vec128.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864f_avx/vec128_mul_asm.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864f_avx/vec256.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864f_avx/vec256.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864f_avx/vec256_mul_asm.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864f_avx/vec_mul_asm.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864f_avx/vec_mul_sp_asm.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece348864f_avx/vec_reduce_asm.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896_avx/LICENSE create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896_avx/aes256ctr.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896_avx/aes256ctr.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896_avx/api.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896_avx/benes.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896_avx/benes.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896_avx/bm.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896_avx/bm.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896_avx/consts.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896_avx/consts.inc create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896_avx/controlbits.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896_avx/controlbits.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896_avx/crypto_hash.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896_avx/decrypt.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896_avx/decrypt.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896_avx/encrypt.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896_avx/encrypt.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896_avx/fft.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896_avx/fft.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896_avx/fft_tr.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896_avx/fft_tr.h create mode 100644 
src/kem/classic_mceliece/pqclean_mceliece460896_avx/gf.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896_avx/gf.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896_avx/int32_sort.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896_avx/int32_sort.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896_avx/operations.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896_avx/params.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896_avx/pk_gen.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896_avx/pk_gen.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896_avx/scalars_2x.inc create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896_avx/scalars_4x.inc create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896_avx/sk_gen.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896_avx/sk_gen.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896_avx/syndrome_asm.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896_avx/transpose.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896_avx/transpose.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896_avx/transpose_64x128_sp_asm.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896_avx/transpose_64x256_sp_asm.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896_avx/uint32_sort.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896_avx/uint32_sort.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896_avx/update_asm.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896_avx/util.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896_avx/util.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896_avx/vec128.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896_avx/vec128.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896_avx/vec128_mul_asm.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896_avx/vec256.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896_avx/vec256.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896_avx/vec256_ama_asm.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896_avx/vec256_maa_asm.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896_avx/vec256_mul_asm.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896_avx/vec_reduce_asm.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896f_avx/LICENSE create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896f_avx/aes256ctr.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896f_avx/aes256ctr.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896f_avx/api.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896f_avx/benes.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896f_avx/benes.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896f_avx/bm.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896f_avx/bm.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896f_avx/consts.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896f_avx/consts.inc create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896f_avx/controlbits.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896f_avx/controlbits.h 
create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896f_avx/crypto_hash.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896f_avx/decrypt.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896f_avx/decrypt.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896f_avx/encrypt.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896f_avx/encrypt.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896f_avx/fft.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896f_avx/fft.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896f_avx/fft_tr.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896f_avx/fft_tr.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896f_avx/gf.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896f_avx/gf.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896f_avx/int32_sort.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896f_avx/int32_sort.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896f_avx/operations.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896f_avx/params.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896f_avx/pk_gen.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896f_avx/pk_gen.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896f_avx/scalars_2x.inc create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896f_avx/scalars_4x.inc create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896f_avx/sk_gen.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896f_avx/sk_gen.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896f_avx/syndrome_asm.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896f_avx/transpose.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896f_avx/transpose.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896f_avx/transpose_64x128_sp_asm.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896f_avx/transpose_64x256_sp_asm.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896f_avx/uint32_sort.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896f_avx/uint32_sort.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896f_avx/update_asm.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896f_avx/util.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896f_avx/util.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896f_avx/vec128.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896f_avx/vec128.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896f_avx/vec128_mul_asm.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896f_avx/vec256.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896f_avx/vec256.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896f_avx/vec256_ama_asm.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896f_avx/vec256_maa_asm.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896f_avx/vec256_mul_asm.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece460896f_avx/vec_reduce_asm.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128_avx/LICENSE create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128_avx/aes256ctr.c create mode 100644 
src/kem/classic_mceliece/pqclean_mceliece6688128_avx/aes256ctr.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128_avx/api.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128_avx/benes.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128_avx/benes.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128_avx/bm.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128_avx/bm.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128_avx/consts.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128_avx/consts.inc create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128_avx/controlbits.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128_avx/controlbits.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128_avx/crypto_hash.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128_avx/decrypt.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128_avx/decrypt.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128_avx/encrypt.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128_avx/encrypt.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128_avx/fft.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128_avx/fft.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128_avx/fft_tr.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128_avx/fft_tr.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128_avx/gf.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128_avx/gf.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128_avx/int32_sort.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128_avx/int32_sort.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128_avx/operations.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128_avx/params.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128_avx/pk_gen.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128_avx/pk_gen.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128_avx/powers.inc create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128_avx/scalars_2x.inc create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128_avx/scalars_4x.inc create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128_avx/sk_gen.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128_avx/sk_gen.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128_avx/syndrome_asm.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128_avx/transpose.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128_avx/transpose.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128_avx/transpose_64x128_sp_asm.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128_avx/transpose_64x256_sp_asm.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128_avx/uint32_sort.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128_avx/uint32_sort.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128_avx/update_asm.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128_avx/util.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128_avx/util.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128_avx/vec128.c create mode 
100644 src/kem/classic_mceliece/pqclean_mceliece6688128_avx/vec128.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128_avx/vec128_mul_asm.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128_avx/vec256.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128_avx/vec256.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128_avx/vec256_ama_asm.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128_avx/vec256_maa_asm.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128_avx/vec256_mul_asm.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128_avx/vec_reduce_asm.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/LICENSE create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/aes256ctr.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/aes256ctr.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/api.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/benes.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/benes.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/bm.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/bm.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/consts.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/consts.inc create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/controlbits.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/controlbits.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/crypto_hash.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/decrypt.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/decrypt.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/encrypt.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/encrypt.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/fft.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/fft.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/fft_tr.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/fft_tr.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/gf.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/gf.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/int32_sort.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/int32_sort.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/operations.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/params.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/pk_gen.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/pk_gen.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/powers.inc create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/scalars_2x.inc create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/scalars_4x.inc create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/sk_gen.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/sk_gen.h create mode 100644 
src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/syndrome_asm.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/transpose.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/transpose.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/transpose_64x128_sp_asm.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/transpose_64x256_sp_asm.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/uint32_sort.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/uint32_sort.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/update_asm.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/util.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/util.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/vec128.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/vec128.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/vec128_mul_asm.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/vec256.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/vec256.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/vec256_ama_asm.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/vec256_maa_asm.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/vec256_mul_asm.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/vec_reduce_asm.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119_avx/LICENSE create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119_avx/aes256ctr.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119_avx/aes256ctr.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119_avx/api.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119_avx/benes.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119_avx/benes.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119_avx/bm.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119_avx/bm.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119_avx/consts.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119_avx/consts.inc create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119_avx/controlbits.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119_avx/controlbits.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119_avx/crypto_hash.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119_avx/decrypt.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119_avx/decrypt.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119_avx/encrypt.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119_avx/encrypt.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119_avx/fft.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119_avx/fft.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119_avx/fft_tr.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119_avx/fft_tr.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119_avx/gf.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119_avx/gf.h create mode 100644 
src/kem/classic_mceliece/pqclean_mceliece6960119_avx/int32_sort.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119_avx/int32_sort.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119_avx/operations.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119_avx/params.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119_avx/pk_gen.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119_avx/pk_gen.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119_avx/scalars_2x.inc create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119_avx/scalars_4x.inc create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119_avx/sk_gen.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119_avx/sk_gen.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119_avx/syndrome_asm.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119_avx/transpose.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119_avx/transpose.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119_avx/transpose_64x128_sp_asm.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119_avx/transpose_64x256_sp_asm.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119_avx/uint32_sort.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119_avx/uint32_sort.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119_avx/update_asm.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119_avx/util.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119_avx/util.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119_avx/vec128.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119_avx/vec128.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119_avx/vec128_mul_asm.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119_avx/vec256.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119_avx/vec256.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119_avx/vec256_ama_asm.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119_avx/vec256_maa_asm.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119_avx/vec256_mul_asm.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119_avx/vec_reduce_asm.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/LICENSE create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/aes256ctr.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/aes256ctr.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/api.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/benes.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/benes.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/bm.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/bm.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/consts.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/consts.inc create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/controlbits.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/controlbits.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/crypto_hash.h create mode 100644 
src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/decrypt.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/decrypt.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/encrypt.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/encrypt.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/fft.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/fft.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/fft_tr.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/fft_tr.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/gf.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/gf.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/int32_sort.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/int32_sort.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/operations.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/params.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/pk_gen.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/pk_gen.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/scalars_2x.inc create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/scalars_4x.inc create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/sk_gen.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/sk_gen.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/syndrome_asm.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/transpose.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/transpose.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/transpose_64x128_sp_asm.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/transpose_64x256_sp_asm.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/uint32_sort.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/uint32_sort.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/update_asm.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/util.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/util.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/vec128.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/vec128.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/vec128_mul_asm.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/vec256.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/vec256.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/vec256_ama_asm.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/vec256_maa_asm.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/vec256_mul_asm.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/vec_reduce_asm.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128_avx/LICENSE create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128_avx/aes256ctr.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128_avx/aes256ctr.h create mode 
100644 src/kem/classic_mceliece/pqclean_mceliece8192128_avx/api.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128_avx/benes.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128_avx/benes.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128_avx/bm.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128_avx/bm.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128_avx/consts.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128_avx/consts.inc create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128_avx/controlbits.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128_avx/controlbits.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128_avx/crypto_hash.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128_avx/decrypt.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128_avx/decrypt.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128_avx/encrypt.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128_avx/encrypt.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128_avx/fft.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128_avx/fft.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128_avx/fft_tr.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128_avx/fft_tr.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128_avx/gf.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128_avx/gf.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128_avx/int32_sort.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128_avx/int32_sort.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128_avx/operations.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128_avx/params.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128_avx/pk_gen.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128_avx/pk_gen.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128_avx/powers.inc create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128_avx/scalars_2x.inc create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128_avx/scalars_4x.inc create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128_avx/sk_gen.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128_avx/sk_gen.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128_avx/syndrome_asm.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128_avx/transpose.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128_avx/transpose.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128_avx/transpose_64x128_sp_asm.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128_avx/transpose_64x256_sp_asm.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128_avx/uint32_sort.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128_avx/uint32_sort.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128_avx/update_asm.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128_avx/util.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128_avx/util.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128_avx/vec128.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128_avx/vec128.h create 
mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128_avx/vec128_mul_asm.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128_avx/vec256.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128_avx/vec256.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128_avx/vec256_ama_asm.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128_avx/vec256_maa_asm.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128_avx/vec256_mul_asm.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128_avx/vec_reduce_asm.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/LICENSE create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/aes256ctr.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/aes256ctr.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/api.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/benes.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/benes.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/bm.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/bm.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/consts.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/consts.inc create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/controlbits.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/controlbits.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/crypto_hash.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/decrypt.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/decrypt.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/encrypt.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/encrypt.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/fft.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/fft.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/fft_tr.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/fft_tr.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/gf.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/gf.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/int32_sort.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/int32_sort.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/operations.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/params.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/pk_gen.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/pk_gen.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/powers.inc create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/scalars_2x.inc create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/scalars_4x.inc create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/sk_gen.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/sk_gen.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/syndrome_asm.S create mode 100644 
src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/transpose.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/transpose.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/transpose_64x128_sp_asm.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/transpose_64x256_sp_asm.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/uint32_sort.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/uint32_sort.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/update_asm.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/util.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/util.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/vec128.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/vec128.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/vec128_mul_asm.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/vec256.c create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/vec256.h create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/vec256_ama_asm.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/vec256_maa_asm.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/vec256_mul_asm.S create mode 100644 src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/vec_reduce_asm.S diff --git a/.CMake/alg_support.cmake b/.CMake/alg_support.cmake index 12fd384b78..b72890d8a6 100644 --- a/.CMake/alg_support.cmake +++ b/.CMake/alg_support.cmake @@ -67,15 +67,45 @@ cmake_dependent_option(OQS_ENABLE_SIG_picnic3_L5 "" ON "OQS_ENABLE_SIG_PICNIC" O ##### OQS_COPY_FROM_UPSTREAM_FRAGMENT_ADD_ENABLE_BY_ALG_START option(OQS_ENABLE_KEM_CLASSIC_MCELIECE "" ON) cmake_dependent_option(OQS_ENABLE_KEM_classic_mceliece_348864 "" ON "OQS_ENABLE_KEM_CLASSIC_MCELIECE" OFF) +if(ARCH STREQUAL "x86_64" AND CMAKE_SYSTEM_NAME MATCHES "Linux|Darwin" AND OQS_USE_AVX2_INSTRUCTIONS AND OQS_USE_POPCNT_INSTRUCTIONS) + cmake_dependent_option(OQS_ENABLE_KEM_classic_mceliece_348864_avx "" ON "OQS_ENABLE_KEM_classic_mceliece_348864" OFF) +endif() cmake_dependent_option(OQS_ENABLE_KEM_classic_mceliece_348864f "" ON "OQS_ENABLE_KEM_CLASSIC_MCELIECE" OFF) +if(ARCH STREQUAL "x86_64" AND CMAKE_SYSTEM_NAME MATCHES "Linux|Darwin" AND OQS_USE_AVX2_INSTRUCTIONS AND OQS_USE_POPCNT_INSTRUCTIONS AND OQS_USE_BMI1_INSTRUCTIONS) + cmake_dependent_option(OQS_ENABLE_KEM_classic_mceliece_348864f_avx "" ON "OQS_ENABLE_KEM_classic_mceliece_348864f" OFF) +endif() cmake_dependent_option(OQS_ENABLE_KEM_classic_mceliece_460896 "" ON "OQS_ENABLE_KEM_CLASSIC_MCELIECE" OFF) +if(ARCH STREQUAL "x86_64" AND CMAKE_SYSTEM_NAME MATCHES "Linux|Darwin" AND OQS_USE_AVX2_INSTRUCTIONS AND OQS_USE_POPCNT_INSTRUCTIONS) + cmake_dependent_option(OQS_ENABLE_KEM_classic_mceliece_460896_avx "" ON "OQS_ENABLE_KEM_classic_mceliece_460896" OFF) +endif() cmake_dependent_option(OQS_ENABLE_KEM_classic_mceliece_460896f "" ON "OQS_ENABLE_KEM_CLASSIC_MCELIECE" OFF) +if(ARCH STREQUAL "x86_64" AND CMAKE_SYSTEM_NAME MATCHES "Linux|Darwin" AND OQS_USE_AVX2_INSTRUCTIONS AND OQS_USE_BMI1_INSTRUCTIONS AND OQS_USE_POPCNT_INSTRUCTIONS) + cmake_dependent_option(OQS_ENABLE_KEM_classic_mceliece_460896f_avx "" ON "OQS_ENABLE_KEM_classic_mceliece_460896f" OFF) +endif() cmake_dependent_option(OQS_ENABLE_KEM_classic_mceliece_6688128 "" ON 
"OQS_ENABLE_KEM_CLASSIC_MCELIECE" OFF) +if(ARCH STREQUAL "x86_64" AND CMAKE_SYSTEM_NAME MATCHES "Linux|Darwin" AND OQS_USE_AVX2_INSTRUCTIONS AND OQS_USE_POPCNT_INSTRUCTIONS) + cmake_dependent_option(OQS_ENABLE_KEM_classic_mceliece_6688128_avx "" ON "OQS_ENABLE_KEM_classic_mceliece_6688128" OFF) +endif() cmake_dependent_option(OQS_ENABLE_KEM_classic_mceliece_6688128f "" ON "OQS_ENABLE_KEM_CLASSIC_MCELIECE" OFF) +if(ARCH STREQUAL "x86_64" AND CMAKE_SYSTEM_NAME MATCHES "Linux|Darwin" AND OQS_USE_AVX2_INSTRUCTIONS AND OQS_USE_BMI1_INSTRUCTIONS AND OQS_USE_POPCNT_INSTRUCTIONS) + cmake_dependent_option(OQS_ENABLE_KEM_classic_mceliece_6688128f_avx "" ON "OQS_ENABLE_KEM_classic_mceliece_6688128f" OFF) +endif() cmake_dependent_option(OQS_ENABLE_KEM_classic_mceliece_6960119 "" ON "OQS_ENABLE_KEM_CLASSIC_MCELIECE" OFF) +if(ARCH STREQUAL "x86_64" AND CMAKE_SYSTEM_NAME MATCHES "Linux|Darwin" AND OQS_USE_AVX2_INSTRUCTIONS AND OQS_USE_POPCNT_INSTRUCTIONS) + cmake_dependent_option(OQS_ENABLE_KEM_classic_mceliece_6960119_avx "" ON "OQS_ENABLE_KEM_classic_mceliece_6960119" OFF) +endif() cmake_dependent_option(OQS_ENABLE_KEM_classic_mceliece_6960119f "" ON "OQS_ENABLE_KEM_CLASSIC_MCELIECE" OFF) +if(ARCH STREQUAL "x86_64" AND CMAKE_SYSTEM_NAME MATCHES "Linux|Darwin" AND OQS_USE_AVX2_INSTRUCTIONS AND OQS_USE_BMI1_INSTRUCTIONS AND OQS_USE_POPCNT_INSTRUCTIONS) + cmake_dependent_option(OQS_ENABLE_KEM_classic_mceliece_6960119f_avx "" ON "OQS_ENABLE_KEM_classic_mceliece_6960119f" OFF) +endif() cmake_dependent_option(OQS_ENABLE_KEM_classic_mceliece_8192128 "" ON "OQS_ENABLE_KEM_CLASSIC_MCELIECE" OFF) +if(ARCH STREQUAL "x86_64" AND CMAKE_SYSTEM_NAME MATCHES "Linux|Darwin" AND OQS_USE_AVX2_INSTRUCTIONS AND OQS_USE_POPCNT_INSTRUCTIONS) + cmake_dependent_option(OQS_ENABLE_KEM_classic_mceliece_8192128_avx "" ON "OQS_ENABLE_KEM_classic_mceliece_8192128" OFF) +endif() cmake_dependent_option(OQS_ENABLE_KEM_classic_mceliece_8192128f "" ON "OQS_ENABLE_KEM_CLASSIC_MCELIECE" OFF) +if(ARCH STREQUAL "x86_64" AND CMAKE_SYSTEM_NAME MATCHES "Linux|Darwin" AND OQS_USE_AVX2_INSTRUCTIONS AND OQS_USE_POPCNT_INSTRUCTIONS AND OQS_USE_BMI1_INSTRUCTIONS) + cmake_dependent_option(OQS_ENABLE_KEM_classic_mceliece_8192128f_avx "" ON "OQS_ENABLE_KEM_classic_mceliece_8192128f" OFF) +endif() option(OQS_ENABLE_KEM_HQC "" ON) cmake_dependent_option(OQS_ENABLE_KEM_hqc_128 "" ON "OQS_ENABLE_KEM_HQC" OFF) diff --git a/scripts/copy_from_upstream/copy_from_upstream.yml b/scripts/copy_from_upstream/copy_from_upstream.yml index fccc83303c..25b3621984 100644 --- a/scripts/copy_from_upstream/copy_from_upstream.yml +++ b/scripts/copy_from_upstream/copy_from_upstream.yml @@ -3,7 +3,7 @@ upstreams: name: pqclean git_url: https://github.com/PQClean/PQClean.git git_branch: master - git_commit: 3d7d2024fa892bd7f00dca3fff4122175f4a26dc + git_commit: ebcc71c51a30b6e5db4f1fade22999b346fdafce kem_meta_path: 'crypto_kem/{pqclean_scheme}/META.yml' sig_meta_path: 'crypto_sign/{pqclean_scheme}/META.yml' kem_scheme_path: 'crypto_kem/{pqclean_scheme}' diff --git a/src/kem/classic_mceliece/CMakeLists.txt b/src/kem/classic_mceliece/CMakeLists.txt index 4a9809ef43..1b8f58d590 100644 --- a/src/kem/classic_mceliece/CMakeLists.txt +++ b/src/kem/classic_mceliece/CMakeLists.txt @@ -12,6 +12,14 @@ if(OQS_ENABLE_KEM_classic_mceliece_348864) set(_CLASSIC_MCELIECE_OBJS ${_CLASSIC_MCELIECE_OBJS} $) endif() +if(OQS_ENABLE_KEM_classic_mceliece_348864_avx) + add_library(classic_mceliece_348864_avx OBJECT pqclean_mceliece348864_avx/aes256ctr.c pqclean_mceliece348864_avx/benes.c 
pqclean_mceliece348864_avx/bm.c pqclean_mceliece348864_avx/consts.S pqclean_mceliece348864_avx/controlbits.c pqclean_mceliece348864_avx/decrypt.c pqclean_mceliece348864_avx/encrypt.c pqclean_mceliece348864_avx/fft.c pqclean_mceliece348864_avx/fft_tr.c pqclean_mceliece348864_avx/gf.c pqclean_mceliece348864_avx/int32_sort.c pqclean_mceliece348864_avx/operations.c pqclean_mceliece348864_avx/pk_gen.c pqclean_mceliece348864_avx/sk_gen.c pqclean_mceliece348864_avx/syndrome_asm.S pqclean_mceliece348864_avx/transpose.c pqclean_mceliece348864_avx/transpose_64x256_sp_asm.S pqclean_mceliece348864_avx/transpose_64x64_asm.S pqclean_mceliece348864_avx/uint32_sort.c pqclean_mceliece348864_avx/update_asm.S pqclean_mceliece348864_avx/util.c pqclean_mceliece348864_avx/vec.c pqclean_mceliece348864_avx/vec128.c pqclean_mceliece348864_avx/vec128_mul_asm.S pqclean_mceliece348864_avx/vec256.c pqclean_mceliece348864_avx/vec256_mul_asm.S pqclean_mceliece348864_avx/vec_mul_asm.S pqclean_mceliece348864_avx/vec_mul_sp_asm.S pqclean_mceliece348864_avx/vec_reduce_asm.S) + target_include_directories(classic_mceliece_348864_avx PRIVATE ${CMAKE_CURRENT_LIST_DIR}/pqclean_mceliece348864_avx) + target_include_directories(classic_mceliece_348864_avx PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims) + target_compile_options(classic_mceliece_348864_avx PRIVATE -mavx2 -mpopcnt) + set(_CLASSIC_MCELIECE_OBJS ${_CLASSIC_MCELIECE_OBJS} $<TARGET_OBJECTS:classic_mceliece_348864_avx>) +endif() + if(OQS_ENABLE_KEM_classic_mceliece_348864f) add_library(classic_mceliece_348864f_vec OBJECT kem_classic_mceliece_348864f.c pqclean_mceliece348864f_vec/aes256ctr.c pqclean_mceliece348864f_vec/benes.c pqclean_mceliece348864f_vec/bm.c pqclean_mceliece348864f_vec/controlbits.c pqclean_mceliece348864f_vec/decrypt.c pqclean_mceliece348864f_vec/encrypt.c pqclean_mceliece348864f_vec/fft.c pqclean_mceliece348864f_vec/fft_tr.c pqclean_mceliece348864f_vec/gf.c pqclean_mceliece348864f_vec/operations.c pqclean_mceliece348864f_vec/pk_gen.c pqclean_mceliece348864f_vec/sk_gen.c pqclean_mceliece348864f_vec/transpose.c pqclean_mceliece348864f_vec/util.c pqclean_mceliece348864f_vec/vec.c) target_include_directories(classic_mceliece_348864f_vec PRIVATE ${CMAKE_CURRENT_LIST_DIR}/pqclean_mceliece348864f_vec) @@ -19,6 +27,14 @@ if(OQS_ENABLE_KEM_classic_mceliece_348864f) set(_CLASSIC_MCELIECE_OBJS ${_CLASSIC_MCELIECE_OBJS} $<TARGET_OBJECTS:classic_mceliece_348864f_vec>) endif() +if(OQS_ENABLE_KEM_classic_mceliece_348864f_avx) + add_library(classic_mceliece_348864f_avx OBJECT pqclean_mceliece348864f_avx/aes256ctr.c pqclean_mceliece348864f_avx/benes.c pqclean_mceliece348864f_avx/bm.c pqclean_mceliece348864f_avx/consts.S pqclean_mceliece348864f_avx/controlbits.c pqclean_mceliece348864f_avx/decrypt.c pqclean_mceliece348864f_avx/encrypt.c pqclean_mceliece348864f_avx/fft.c pqclean_mceliece348864f_avx/fft_tr.c pqclean_mceliece348864f_avx/gf.c pqclean_mceliece348864f_avx/int32_sort.c pqclean_mceliece348864f_avx/operations.c pqclean_mceliece348864f_avx/pk_gen.c pqclean_mceliece348864f_avx/sk_gen.c pqclean_mceliece348864f_avx/syndrome_asm.S pqclean_mceliece348864f_avx/transpose.c pqclean_mceliece348864f_avx/transpose_64x256_sp_asm.S pqclean_mceliece348864f_avx/transpose_64x64_asm.S pqclean_mceliece348864f_avx/uint32_sort.c pqclean_mceliece348864f_avx/update_asm.S pqclean_mceliece348864f_avx/util.c pqclean_mceliece348864f_avx/vec.c pqclean_mceliece348864f_avx/vec128.c pqclean_mceliece348864f_avx/vec128_mul_asm.S pqclean_mceliece348864f_avx/vec256.c pqclean_mceliece348864f_avx/vec256_mul_asm.S pqclean_mceliece348864f_avx/vec_mul_asm.S
pqclean_mceliece348864f_avx/vec_mul_sp_asm.S pqclean_mceliece348864f_avx/vec_reduce_asm.S) + target_include_directories(classic_mceliece_348864f_avx PRIVATE ${CMAKE_CURRENT_LIST_DIR}/pqclean_mceliece348864f_avx) + target_include_directories(classic_mceliece_348864f_avx PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims) + target_compile_options(classic_mceliece_348864f_avx PRIVATE -mavx2 -mpopcnt -mbmi) + set(_CLASSIC_MCELIECE_OBJS ${_CLASSIC_MCELIECE_OBJS} $<TARGET_OBJECTS:classic_mceliece_348864f_avx>) +endif() + if(OQS_ENABLE_KEM_classic_mceliece_460896) add_library(classic_mceliece_460896_vec OBJECT kem_classic_mceliece_460896.c pqclean_mceliece460896_vec/aes256ctr.c pqclean_mceliece460896_vec/benes.c pqclean_mceliece460896_vec/bm.c pqclean_mceliece460896_vec/controlbits.c pqclean_mceliece460896_vec/decrypt.c pqclean_mceliece460896_vec/encrypt.c pqclean_mceliece460896_vec/fft.c pqclean_mceliece460896_vec/fft_tr.c pqclean_mceliece460896_vec/gf.c pqclean_mceliece460896_vec/operations.c pqclean_mceliece460896_vec/pk_gen.c pqclean_mceliece460896_vec/sk_gen.c pqclean_mceliece460896_vec/transpose.c pqclean_mceliece460896_vec/util.c pqclean_mceliece460896_vec/vec.c) target_include_directories(classic_mceliece_460896_vec PRIVATE ${CMAKE_CURRENT_LIST_DIR}/pqclean_mceliece460896_vec) @@ -26,6 +42,14 @@ if(OQS_ENABLE_KEM_classic_mceliece_460896) set(_CLASSIC_MCELIECE_OBJS ${_CLASSIC_MCELIECE_OBJS} $<TARGET_OBJECTS:classic_mceliece_460896_vec>) endif() +if(OQS_ENABLE_KEM_classic_mceliece_460896_avx) + add_library(classic_mceliece_460896_avx OBJECT pqclean_mceliece460896_avx/aes256ctr.c pqclean_mceliece460896_avx/benes.c pqclean_mceliece460896_avx/bm.c pqclean_mceliece460896_avx/consts.S pqclean_mceliece460896_avx/controlbits.c pqclean_mceliece460896_avx/decrypt.c pqclean_mceliece460896_avx/encrypt.c pqclean_mceliece460896_avx/fft.c pqclean_mceliece460896_avx/fft_tr.c pqclean_mceliece460896_avx/gf.c pqclean_mceliece460896_avx/int32_sort.c pqclean_mceliece460896_avx/operations.c pqclean_mceliece460896_avx/pk_gen.c pqclean_mceliece460896_avx/sk_gen.c pqclean_mceliece460896_avx/syndrome_asm.S pqclean_mceliece460896_avx/transpose.c pqclean_mceliece460896_avx/transpose_64x128_sp_asm.S pqclean_mceliece460896_avx/transpose_64x256_sp_asm.S pqclean_mceliece460896_avx/uint32_sort.c pqclean_mceliece460896_avx/update_asm.S pqclean_mceliece460896_avx/util.c pqclean_mceliece460896_avx/vec128.c pqclean_mceliece460896_avx/vec128_mul_asm.S pqclean_mceliece460896_avx/vec256.c pqclean_mceliece460896_avx/vec256_ama_asm.S pqclean_mceliece460896_avx/vec256_maa_asm.S pqclean_mceliece460896_avx/vec256_mul_asm.S pqclean_mceliece460896_avx/vec_reduce_asm.S) + target_include_directories(classic_mceliece_460896_avx PRIVATE ${CMAKE_CURRENT_LIST_DIR}/pqclean_mceliece460896_avx) + target_include_directories(classic_mceliece_460896_avx PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims) + target_compile_options(classic_mceliece_460896_avx PRIVATE -mavx2 -mpopcnt) + set(_CLASSIC_MCELIECE_OBJS ${_CLASSIC_MCELIECE_OBJS} $<TARGET_OBJECTS:classic_mceliece_460896_avx>) +endif() + if(OQS_ENABLE_KEM_classic_mceliece_460896f) add_library(classic_mceliece_460896f_vec OBJECT kem_classic_mceliece_460896f.c pqclean_mceliece460896f_vec/aes256ctr.c pqclean_mceliece460896f_vec/benes.c pqclean_mceliece460896f_vec/bm.c pqclean_mceliece460896f_vec/controlbits.c pqclean_mceliece460896f_vec/decrypt.c pqclean_mceliece460896f_vec/encrypt.c pqclean_mceliece460896f_vec/fft.c pqclean_mceliece460896f_vec/fft_tr.c pqclean_mceliece460896f_vec/gf.c pqclean_mceliece460896f_vec/operations.c pqclean_mceliece460896f_vec/pk_gen.c pqclean_mceliece460896f_vec/sk_gen.c
pqclean_mceliece460896f_vec/transpose.c pqclean_mceliece460896f_vec/util.c pqclean_mceliece460896f_vec/vec.c) target_include_directories(classic_mceliece_460896f_vec PRIVATE ${CMAKE_CURRENT_LIST_DIR}/pqclean_mceliece460896f_vec) @@ -33,6 +57,14 @@ if(OQS_ENABLE_KEM_classic_mceliece_460896f) set(_CLASSIC_MCELIECE_OBJS ${_CLASSIC_MCELIECE_OBJS} $<TARGET_OBJECTS:classic_mceliece_460896f_vec>) endif() +if(OQS_ENABLE_KEM_classic_mceliece_460896f_avx) + add_library(classic_mceliece_460896f_avx OBJECT pqclean_mceliece460896f_avx/aes256ctr.c pqclean_mceliece460896f_avx/benes.c pqclean_mceliece460896f_avx/bm.c pqclean_mceliece460896f_avx/consts.S pqclean_mceliece460896f_avx/controlbits.c pqclean_mceliece460896f_avx/decrypt.c pqclean_mceliece460896f_avx/encrypt.c pqclean_mceliece460896f_avx/fft.c pqclean_mceliece460896f_avx/fft_tr.c pqclean_mceliece460896f_avx/gf.c pqclean_mceliece460896f_avx/int32_sort.c pqclean_mceliece460896f_avx/operations.c pqclean_mceliece460896f_avx/pk_gen.c pqclean_mceliece460896f_avx/sk_gen.c pqclean_mceliece460896f_avx/syndrome_asm.S pqclean_mceliece460896f_avx/transpose.c pqclean_mceliece460896f_avx/transpose_64x128_sp_asm.S pqclean_mceliece460896f_avx/transpose_64x256_sp_asm.S pqclean_mceliece460896f_avx/uint32_sort.c pqclean_mceliece460896f_avx/update_asm.S pqclean_mceliece460896f_avx/util.c pqclean_mceliece460896f_avx/vec128.c pqclean_mceliece460896f_avx/vec128_mul_asm.S pqclean_mceliece460896f_avx/vec256.c pqclean_mceliece460896f_avx/vec256_ama_asm.S pqclean_mceliece460896f_avx/vec256_maa_asm.S pqclean_mceliece460896f_avx/vec256_mul_asm.S pqclean_mceliece460896f_avx/vec_reduce_asm.S) + target_include_directories(classic_mceliece_460896f_avx PRIVATE ${CMAKE_CURRENT_LIST_DIR}/pqclean_mceliece460896f_avx) + target_include_directories(classic_mceliece_460896f_avx PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims) + target_compile_options(classic_mceliece_460896f_avx PRIVATE -mavx2 -mbmi -mpopcnt) + set(_CLASSIC_MCELIECE_OBJS ${_CLASSIC_MCELIECE_OBJS} $<TARGET_OBJECTS:classic_mceliece_460896f_avx>) +endif() + if(OQS_ENABLE_KEM_classic_mceliece_6688128) add_library(classic_mceliece_6688128_vec OBJECT kem_classic_mceliece_6688128.c pqclean_mceliece6688128_vec/aes256ctr.c pqclean_mceliece6688128_vec/benes.c pqclean_mceliece6688128_vec/bm.c pqclean_mceliece6688128_vec/controlbits.c pqclean_mceliece6688128_vec/decrypt.c pqclean_mceliece6688128_vec/encrypt.c pqclean_mceliece6688128_vec/fft.c pqclean_mceliece6688128_vec/fft_tr.c pqclean_mceliece6688128_vec/gf.c pqclean_mceliece6688128_vec/operations.c pqclean_mceliece6688128_vec/pk_gen.c pqclean_mceliece6688128_vec/sk_gen.c pqclean_mceliece6688128_vec/transpose.c pqclean_mceliece6688128_vec/util.c pqclean_mceliece6688128_vec/vec.c) target_include_directories(classic_mceliece_6688128_vec PRIVATE ${CMAKE_CURRENT_LIST_DIR}/pqclean_mceliece6688128_vec) @@ -40,6 +72,14 @@ if(OQS_ENABLE_KEM_classic_mceliece_6688128) set(_CLASSIC_MCELIECE_OBJS ${_CLASSIC_MCELIECE_OBJS} $<TARGET_OBJECTS:classic_mceliece_6688128_vec>) endif() +if(OQS_ENABLE_KEM_classic_mceliece_6688128_avx) + add_library(classic_mceliece_6688128_avx OBJECT pqclean_mceliece6688128_avx/aes256ctr.c pqclean_mceliece6688128_avx/benes.c pqclean_mceliece6688128_avx/bm.c pqclean_mceliece6688128_avx/consts.S pqclean_mceliece6688128_avx/controlbits.c pqclean_mceliece6688128_avx/decrypt.c pqclean_mceliece6688128_avx/encrypt.c pqclean_mceliece6688128_avx/fft.c pqclean_mceliece6688128_avx/fft_tr.c pqclean_mceliece6688128_avx/gf.c pqclean_mceliece6688128_avx/int32_sort.c pqclean_mceliece6688128_avx/operations.c pqclean_mceliece6688128_avx/pk_gen.c pqclean_mceliece6688128_avx/sk_gen.c
pqclean_mceliece6688128_avx/syndrome_asm.S pqclean_mceliece6688128_avx/transpose.c pqclean_mceliece6688128_avx/transpose_64x128_sp_asm.S pqclean_mceliece6688128_avx/transpose_64x256_sp_asm.S pqclean_mceliece6688128_avx/uint32_sort.c pqclean_mceliece6688128_avx/update_asm.S pqclean_mceliece6688128_avx/util.c pqclean_mceliece6688128_avx/vec128.c pqclean_mceliece6688128_avx/vec128_mul_asm.S pqclean_mceliece6688128_avx/vec256.c pqclean_mceliece6688128_avx/vec256_ama_asm.S pqclean_mceliece6688128_avx/vec256_maa_asm.S pqclean_mceliece6688128_avx/vec256_mul_asm.S pqclean_mceliece6688128_avx/vec_reduce_asm.S) + target_include_directories(classic_mceliece_6688128_avx PRIVATE ${CMAKE_CURRENT_LIST_DIR}/pqclean_mceliece6688128_avx) + target_include_directories(classic_mceliece_6688128_avx PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims) + target_compile_options(classic_mceliece_6688128_avx PRIVATE -mavx2 -mpopcnt) + set(_CLASSIC_MCELIECE_OBJS ${_CLASSIC_MCELIECE_OBJS} $<TARGET_OBJECTS:classic_mceliece_6688128_avx>) +endif() + if(OQS_ENABLE_KEM_classic_mceliece_6688128f) add_library(classic_mceliece_6688128f_vec OBJECT kem_classic_mceliece_6688128f.c pqclean_mceliece6688128f_vec/aes256ctr.c pqclean_mceliece6688128f_vec/benes.c pqclean_mceliece6688128f_vec/bm.c pqclean_mceliece6688128f_vec/controlbits.c pqclean_mceliece6688128f_vec/decrypt.c pqclean_mceliece6688128f_vec/encrypt.c pqclean_mceliece6688128f_vec/fft.c pqclean_mceliece6688128f_vec/fft_tr.c pqclean_mceliece6688128f_vec/gf.c pqclean_mceliece6688128f_vec/operations.c pqclean_mceliece6688128f_vec/pk_gen.c pqclean_mceliece6688128f_vec/sk_gen.c pqclean_mceliece6688128f_vec/transpose.c pqclean_mceliece6688128f_vec/util.c pqclean_mceliece6688128f_vec/vec.c) target_include_directories(classic_mceliece_6688128f_vec PRIVATE ${CMAKE_CURRENT_LIST_DIR}/pqclean_mceliece6688128f_vec) @@ -47,6 +87,14 @@ if(OQS_ENABLE_KEM_classic_mceliece_6688128f) set(_CLASSIC_MCELIECE_OBJS ${_CLASSIC_MCELIECE_OBJS} $<TARGET_OBJECTS:classic_mceliece_6688128f_vec>) endif() +if(OQS_ENABLE_KEM_classic_mceliece_6688128f_avx) + add_library(classic_mceliece_6688128f_avx OBJECT pqclean_mceliece6688128f_avx/aes256ctr.c pqclean_mceliece6688128f_avx/benes.c pqclean_mceliece6688128f_avx/bm.c pqclean_mceliece6688128f_avx/consts.S pqclean_mceliece6688128f_avx/controlbits.c pqclean_mceliece6688128f_avx/decrypt.c pqclean_mceliece6688128f_avx/encrypt.c pqclean_mceliece6688128f_avx/fft.c pqclean_mceliece6688128f_avx/fft_tr.c pqclean_mceliece6688128f_avx/gf.c pqclean_mceliece6688128f_avx/int32_sort.c pqclean_mceliece6688128f_avx/operations.c pqclean_mceliece6688128f_avx/pk_gen.c pqclean_mceliece6688128f_avx/sk_gen.c pqclean_mceliece6688128f_avx/syndrome_asm.S pqclean_mceliece6688128f_avx/transpose.c pqclean_mceliece6688128f_avx/transpose_64x128_sp_asm.S pqclean_mceliece6688128f_avx/transpose_64x256_sp_asm.S pqclean_mceliece6688128f_avx/uint32_sort.c pqclean_mceliece6688128f_avx/update_asm.S pqclean_mceliece6688128f_avx/util.c pqclean_mceliece6688128f_avx/vec128.c pqclean_mceliece6688128f_avx/vec128_mul_asm.S pqclean_mceliece6688128f_avx/vec256.c pqclean_mceliece6688128f_avx/vec256_ama_asm.S pqclean_mceliece6688128f_avx/vec256_maa_asm.S pqclean_mceliece6688128f_avx/vec256_mul_asm.S pqclean_mceliece6688128f_avx/vec_reduce_asm.S) + target_include_directories(classic_mceliece_6688128f_avx PRIVATE ${CMAKE_CURRENT_LIST_DIR}/pqclean_mceliece6688128f_avx) + target_include_directories(classic_mceliece_6688128f_avx PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims) + target_compile_options(classic_mceliece_6688128f_avx PRIVATE -mavx2 -mbmi -mpopcnt) + set(_CLASSIC_MCELIECE_OBJS
${_CLASSIC_MCELIECE_OBJS} $<TARGET_OBJECTS:classic_mceliece_6688128f_avx>) +endif() + if(OQS_ENABLE_KEM_classic_mceliece_6960119) add_library(classic_mceliece_6960119_vec OBJECT kem_classic_mceliece_6960119.c pqclean_mceliece6960119_vec/aes256ctr.c pqclean_mceliece6960119_vec/benes.c pqclean_mceliece6960119_vec/bm.c pqclean_mceliece6960119_vec/controlbits.c pqclean_mceliece6960119_vec/decrypt.c pqclean_mceliece6960119_vec/encrypt.c pqclean_mceliece6960119_vec/fft.c pqclean_mceliece6960119_vec/fft_tr.c pqclean_mceliece6960119_vec/gf.c pqclean_mceliece6960119_vec/operations.c pqclean_mceliece6960119_vec/pk_gen.c pqclean_mceliece6960119_vec/sk_gen.c pqclean_mceliece6960119_vec/transpose.c pqclean_mceliece6960119_vec/util.c pqclean_mceliece6960119_vec/vec.c) target_include_directories(classic_mceliece_6960119_vec PRIVATE ${CMAKE_CURRENT_LIST_DIR}/pqclean_mceliece6960119_vec) @@ -54,6 +102,14 @@ if(OQS_ENABLE_KEM_classic_mceliece_6960119) set(_CLASSIC_MCELIECE_OBJS ${_CLASSIC_MCELIECE_OBJS} $<TARGET_OBJECTS:classic_mceliece_6960119_vec>) endif() +if(OQS_ENABLE_KEM_classic_mceliece_6960119_avx) + add_library(classic_mceliece_6960119_avx OBJECT pqclean_mceliece6960119_avx/aes256ctr.c pqclean_mceliece6960119_avx/benes.c pqclean_mceliece6960119_avx/bm.c pqclean_mceliece6960119_avx/consts.S pqclean_mceliece6960119_avx/controlbits.c pqclean_mceliece6960119_avx/decrypt.c pqclean_mceliece6960119_avx/encrypt.c pqclean_mceliece6960119_avx/fft.c pqclean_mceliece6960119_avx/fft_tr.c pqclean_mceliece6960119_avx/gf.c pqclean_mceliece6960119_avx/int32_sort.c pqclean_mceliece6960119_avx/operations.c pqclean_mceliece6960119_avx/pk_gen.c pqclean_mceliece6960119_avx/sk_gen.c pqclean_mceliece6960119_avx/syndrome_asm.S pqclean_mceliece6960119_avx/transpose.c pqclean_mceliece6960119_avx/transpose_64x128_sp_asm.S pqclean_mceliece6960119_avx/transpose_64x256_sp_asm.S pqclean_mceliece6960119_avx/uint32_sort.c pqclean_mceliece6960119_avx/update_asm.S pqclean_mceliece6960119_avx/util.c pqclean_mceliece6960119_avx/vec128.c pqclean_mceliece6960119_avx/vec128_mul_asm.S pqclean_mceliece6960119_avx/vec256.c pqclean_mceliece6960119_avx/vec256_ama_asm.S pqclean_mceliece6960119_avx/vec256_maa_asm.S pqclean_mceliece6960119_avx/vec256_mul_asm.S pqclean_mceliece6960119_avx/vec_reduce_asm.S) + target_include_directories(classic_mceliece_6960119_avx PRIVATE ${CMAKE_CURRENT_LIST_DIR}/pqclean_mceliece6960119_avx) + target_include_directories(classic_mceliece_6960119_avx PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims) + target_compile_options(classic_mceliece_6960119_avx PRIVATE -mavx2 -mpopcnt) + set(_CLASSIC_MCELIECE_OBJS ${_CLASSIC_MCELIECE_OBJS} $<TARGET_OBJECTS:classic_mceliece_6960119_avx>) +endif() + if(OQS_ENABLE_KEM_classic_mceliece_6960119f) add_library(classic_mceliece_6960119f_vec OBJECT kem_classic_mceliece_6960119f.c pqclean_mceliece6960119f_vec/aes256ctr.c pqclean_mceliece6960119f_vec/benes.c pqclean_mceliece6960119f_vec/bm.c pqclean_mceliece6960119f_vec/controlbits.c pqclean_mceliece6960119f_vec/decrypt.c pqclean_mceliece6960119f_vec/encrypt.c pqclean_mceliece6960119f_vec/fft.c pqclean_mceliece6960119f_vec/fft_tr.c pqclean_mceliece6960119f_vec/gf.c pqclean_mceliece6960119f_vec/operations.c pqclean_mceliece6960119f_vec/pk_gen.c pqclean_mceliece6960119f_vec/sk_gen.c pqclean_mceliece6960119f_vec/transpose.c pqclean_mceliece6960119f_vec/util.c pqclean_mceliece6960119f_vec/vec.c) target_include_directories(classic_mceliece_6960119f_vec PRIVATE ${CMAKE_CURRENT_LIST_DIR}/pqclean_mceliece6960119f_vec) @@ -61,6 +117,14 @@ if(OQS_ENABLE_KEM_classic_mceliece_6960119f) set(_CLASSIC_MCELIECE_OBJS ${_CLASSIC_MCELIECE_OBJS} $<TARGET_OBJECTS:classic_mceliece_6960119f_vec>) endif()
+if(OQS_ENABLE_KEM_classic_mceliece_6960119f_avx) + add_library(classic_mceliece_6960119f_avx OBJECT pqclean_mceliece6960119f_avx/aes256ctr.c pqclean_mceliece6960119f_avx/benes.c pqclean_mceliece6960119f_avx/bm.c pqclean_mceliece6960119f_avx/consts.S pqclean_mceliece6960119f_avx/controlbits.c pqclean_mceliece6960119f_avx/decrypt.c pqclean_mceliece6960119f_avx/encrypt.c pqclean_mceliece6960119f_avx/fft.c pqclean_mceliece6960119f_avx/fft_tr.c pqclean_mceliece6960119f_avx/gf.c pqclean_mceliece6960119f_avx/int32_sort.c pqclean_mceliece6960119f_avx/operations.c pqclean_mceliece6960119f_avx/pk_gen.c pqclean_mceliece6960119f_avx/sk_gen.c pqclean_mceliece6960119f_avx/syndrome_asm.S pqclean_mceliece6960119f_avx/transpose.c pqclean_mceliece6960119f_avx/transpose_64x128_sp_asm.S pqclean_mceliece6960119f_avx/transpose_64x256_sp_asm.S pqclean_mceliece6960119f_avx/uint32_sort.c pqclean_mceliece6960119f_avx/update_asm.S pqclean_mceliece6960119f_avx/util.c pqclean_mceliece6960119f_avx/vec128.c pqclean_mceliece6960119f_avx/vec128_mul_asm.S pqclean_mceliece6960119f_avx/vec256.c pqclean_mceliece6960119f_avx/vec256_ama_asm.S pqclean_mceliece6960119f_avx/vec256_maa_asm.S pqclean_mceliece6960119f_avx/vec256_mul_asm.S pqclean_mceliece6960119f_avx/vec_reduce_asm.S) + target_include_directories(classic_mceliece_6960119f_avx PRIVATE ${CMAKE_CURRENT_LIST_DIR}/pqclean_mceliece6960119f_avx) + target_include_directories(classic_mceliece_6960119f_avx PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims) + target_compile_options(classic_mceliece_6960119f_avx PRIVATE -mavx2 -mbmi -mpopcnt) + set(_CLASSIC_MCELIECE_OBJS ${_CLASSIC_MCELIECE_OBJS} $<TARGET_OBJECTS:classic_mceliece_6960119f_avx>) +endif() + if(OQS_ENABLE_KEM_classic_mceliece_8192128) add_library(classic_mceliece_8192128_vec OBJECT kem_classic_mceliece_8192128.c pqclean_mceliece8192128_vec/aes256ctr.c pqclean_mceliece8192128_vec/benes.c pqclean_mceliece8192128_vec/bm.c pqclean_mceliece8192128_vec/controlbits.c pqclean_mceliece8192128_vec/decrypt.c pqclean_mceliece8192128_vec/encrypt.c pqclean_mceliece8192128_vec/fft.c pqclean_mceliece8192128_vec/fft_tr.c pqclean_mceliece8192128_vec/gf.c pqclean_mceliece8192128_vec/operations.c pqclean_mceliece8192128_vec/pk_gen.c pqclean_mceliece8192128_vec/sk_gen.c pqclean_mceliece8192128_vec/transpose.c pqclean_mceliece8192128_vec/util.c pqclean_mceliece8192128_vec/vec.c) target_include_directories(classic_mceliece_8192128_vec PRIVATE ${CMAKE_CURRENT_LIST_DIR}/pqclean_mceliece8192128_vec) @@ -68,6 +132,14 @@ if(OQS_ENABLE_KEM_classic_mceliece_8192128) set(_CLASSIC_MCELIECE_OBJS ${_CLASSIC_MCELIECE_OBJS} $<TARGET_OBJECTS:classic_mceliece_8192128_vec>) endif() +if(OQS_ENABLE_KEM_classic_mceliece_8192128_avx) + add_library(classic_mceliece_8192128_avx OBJECT pqclean_mceliece8192128_avx/aes256ctr.c pqclean_mceliece8192128_avx/benes.c pqclean_mceliece8192128_avx/bm.c pqclean_mceliece8192128_avx/consts.S pqclean_mceliece8192128_avx/controlbits.c pqclean_mceliece8192128_avx/decrypt.c pqclean_mceliece8192128_avx/encrypt.c pqclean_mceliece8192128_avx/fft.c pqclean_mceliece8192128_avx/fft_tr.c pqclean_mceliece8192128_avx/gf.c pqclean_mceliece8192128_avx/int32_sort.c pqclean_mceliece8192128_avx/operations.c pqclean_mceliece8192128_avx/pk_gen.c pqclean_mceliece8192128_avx/sk_gen.c pqclean_mceliece8192128_avx/syndrome_asm.S pqclean_mceliece8192128_avx/transpose.c pqclean_mceliece8192128_avx/transpose_64x128_sp_asm.S pqclean_mceliece8192128_avx/transpose_64x256_sp_asm.S pqclean_mceliece8192128_avx/uint32_sort.c pqclean_mceliece8192128_avx/update_asm.S pqclean_mceliece8192128_avx/util.c pqclean_mceliece8192128_avx/vec128.c
pqclean_mceliece8192128_avx/vec128_mul_asm.S pqclean_mceliece8192128_avx/vec256.c pqclean_mceliece8192128_avx/vec256_ama_asm.S pqclean_mceliece8192128_avx/vec256_maa_asm.S pqclean_mceliece8192128_avx/vec256_mul_asm.S pqclean_mceliece8192128_avx/vec_reduce_asm.S) + target_include_directories(classic_mceliece_8192128_avx PRIVATE ${CMAKE_CURRENT_LIST_DIR}/pqclean_mceliece8192128_avx) + target_include_directories(classic_mceliece_8192128_avx PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims) + target_compile_options(classic_mceliece_8192128_avx PRIVATE -mavx2 -mpopcnt) + set(_CLASSIC_MCELIECE_OBJS ${_CLASSIC_MCELIECE_OBJS} $<TARGET_OBJECTS:classic_mceliece_8192128_avx>) +endif() + if(OQS_ENABLE_KEM_classic_mceliece_8192128f) add_library(classic_mceliece_8192128f_vec OBJECT kem_classic_mceliece_8192128f.c pqclean_mceliece8192128f_vec/aes256ctr.c pqclean_mceliece8192128f_vec/benes.c pqclean_mceliece8192128f_vec/bm.c pqclean_mceliece8192128f_vec/controlbits.c pqclean_mceliece8192128f_vec/decrypt.c pqclean_mceliece8192128f_vec/encrypt.c pqclean_mceliece8192128f_vec/fft.c pqclean_mceliece8192128f_vec/fft_tr.c pqclean_mceliece8192128f_vec/gf.c pqclean_mceliece8192128f_vec/operations.c pqclean_mceliece8192128f_vec/pk_gen.c pqclean_mceliece8192128f_vec/sk_gen.c pqclean_mceliece8192128f_vec/transpose.c pqclean_mceliece8192128f_vec/util.c pqclean_mceliece8192128f_vec/vec.c) target_include_directories(classic_mceliece_8192128f_vec PRIVATE ${CMAKE_CURRENT_LIST_DIR}/pqclean_mceliece8192128f_vec) @@ -75,4 +147,12 @@ if(OQS_ENABLE_KEM_classic_mceliece_8192128f) set(_CLASSIC_MCELIECE_OBJS ${_CLASSIC_MCELIECE_OBJS} $<TARGET_OBJECTS:classic_mceliece_8192128f_vec>) endif() +if(OQS_ENABLE_KEM_classic_mceliece_8192128f_avx) + add_library(classic_mceliece_8192128f_avx OBJECT pqclean_mceliece8192128f_avx/aes256ctr.c pqclean_mceliece8192128f_avx/benes.c pqclean_mceliece8192128f_avx/bm.c pqclean_mceliece8192128f_avx/consts.S pqclean_mceliece8192128f_avx/controlbits.c pqclean_mceliece8192128f_avx/decrypt.c pqclean_mceliece8192128f_avx/encrypt.c pqclean_mceliece8192128f_avx/fft.c pqclean_mceliece8192128f_avx/fft_tr.c pqclean_mceliece8192128f_avx/gf.c pqclean_mceliece8192128f_avx/int32_sort.c pqclean_mceliece8192128f_avx/operations.c pqclean_mceliece8192128f_avx/pk_gen.c pqclean_mceliece8192128f_avx/sk_gen.c pqclean_mceliece8192128f_avx/syndrome_asm.S pqclean_mceliece8192128f_avx/transpose.c pqclean_mceliece8192128f_avx/transpose_64x128_sp_asm.S pqclean_mceliece8192128f_avx/transpose_64x256_sp_asm.S pqclean_mceliece8192128f_avx/uint32_sort.c pqclean_mceliece8192128f_avx/update_asm.S pqclean_mceliece8192128f_avx/util.c pqclean_mceliece8192128f_avx/vec128.c pqclean_mceliece8192128f_avx/vec128_mul_asm.S pqclean_mceliece8192128f_avx/vec256.c pqclean_mceliece8192128f_avx/vec256_ama_asm.S pqclean_mceliece8192128f_avx/vec256_maa_asm.S pqclean_mceliece8192128f_avx/vec256_mul_asm.S pqclean_mceliece8192128f_avx/vec_reduce_asm.S) + target_include_directories(classic_mceliece_8192128f_avx PRIVATE ${CMAKE_CURRENT_LIST_DIR}/pqclean_mceliece8192128f_avx) + target_include_directories(classic_mceliece_8192128f_avx PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims) + target_compile_options(classic_mceliece_8192128f_avx PRIVATE -mavx2 -mpopcnt -mbmi) + set(_CLASSIC_MCELIECE_OBJS ${_CLASSIC_MCELIECE_OBJS} $<TARGET_OBJECTS:classic_mceliece_8192128f_avx>) +endif() + set(CLASSIC_MCELIECE_OBJS ${_CLASSIC_MCELIECE_OBJS} PARENT_SCOPE) diff --git a/src/kem/classic_mceliece/kem_classic_mceliece_348864.c b/src/kem/classic_mceliece/kem_classic_mceliece_348864.c index 4d771f37b1..459755b64a 100644 --- a/src/kem/classic_mceliece/kem_classic_mceliece_348864.c +++
b/src/kem/classic_mceliece/kem_classic_mceliece_348864.c @@ -34,16 +34,61 @@ extern int PQCLEAN_MCELIECE348864_VEC_crypto_kem_keypair(unsigned char *pk, unsi extern int PQCLEAN_MCELIECE348864_VEC_crypto_kem_enc(unsigned char *ct, unsigned char *ss, const unsigned char *pk); extern int PQCLEAN_MCELIECE348864_VEC_crypto_kem_dec(unsigned char *ss, const unsigned char *ct, const unsigned char *sk); +#if defined(OQS_ENABLE_KEM_classic_mceliece_348864_avx) +extern int PQCLEAN_MCELIECE348864_AVX_crypto_kem_keypair(unsigned char *pk, unsigned char *sk); +extern int PQCLEAN_MCELIECE348864_AVX_crypto_kem_enc(unsigned char *ct, unsigned char *ss, const unsigned char *pk); +extern int PQCLEAN_MCELIECE348864_AVX_crypto_kem_dec(unsigned char *ss, const unsigned char *ct, const unsigned char *sk); +#endif + OQS_API OQS_STATUS OQS_KEM_classic_mceliece_348864_keypair(uint8_t *public_key, uint8_t *secret_key) { +#if defined(OQS_ENABLE_KEM_classic_mceliece_348864_avx) +#if defined(OQS_PORTABLE_BUILD) + OQS_CPU_EXTENSIONS available_cpu_extensions = OQS_get_available_CPU_extensions(); + if (available_cpu_extensions.AVX2_ENABLED && available_cpu_extensions.POPCNT_ENABLED) { +#endif /* OQS_PORTABLE_BUILD */ + return (OQS_STATUS) PQCLEAN_MCELIECE348864_AVX_crypto_kem_keypair(public_key, secret_key); +#if defined(OQS_PORTABLE_BUILD) + } else { + return (OQS_STATUS) PQCLEAN_MCELIECE348864_VEC_crypto_kem_keypair(public_key, secret_key); + } +#endif /* OQS_PORTABLE_BUILD */ +#else return (OQS_STATUS) PQCLEAN_MCELIECE348864_VEC_crypto_kem_keypair(public_key, secret_key); +#endif } OQS_API OQS_STATUS OQS_KEM_classic_mceliece_348864_encaps(uint8_t *ciphertext, uint8_t *shared_secret, const uint8_t *public_key) { +#if defined(OQS_ENABLE_KEM_classic_mceliece_348864_avx) +#if defined(OQS_PORTABLE_BUILD) + OQS_CPU_EXTENSIONS available_cpu_extensions = OQS_get_available_CPU_extensions(); + if (available_cpu_extensions.AVX2_ENABLED && available_cpu_extensions.POPCNT_ENABLED) { +#endif /* OQS_PORTABLE_BUILD */ + return (OQS_STATUS) PQCLEAN_MCELIECE348864_AVX_crypto_kem_enc(ciphertext, shared_secret, public_key); +#if defined(OQS_PORTABLE_BUILD) + } else { + return (OQS_STATUS) PQCLEAN_MCELIECE348864_VEC_crypto_kem_enc(ciphertext, shared_secret, public_key); + } +#endif /* OQS_PORTABLE_BUILD */ +#else return (OQS_STATUS) PQCLEAN_MCELIECE348864_VEC_crypto_kem_enc(ciphertext, shared_secret, public_key); +#endif } OQS_API OQS_STATUS OQS_KEM_classic_mceliece_348864_decaps(uint8_t *shared_secret, const unsigned char *ciphertext, const uint8_t *secret_key) { +#if defined(OQS_ENABLE_KEM_classic_mceliece_348864_avx) +#if defined(OQS_PORTABLE_BUILD) + OQS_CPU_EXTENSIONS available_cpu_extensions = OQS_get_available_CPU_extensions(); + if (available_cpu_extensions.AVX2_ENABLED && available_cpu_extensions.POPCNT_ENABLED) { +#endif /* OQS_PORTABLE_BUILD */ + return (OQS_STATUS) PQCLEAN_MCELIECE348864_AVX_crypto_kem_dec(shared_secret, ciphertext, secret_key); +#if defined(OQS_PORTABLE_BUILD) + } else { + return (OQS_STATUS) PQCLEAN_MCELIECE348864_VEC_crypto_kem_dec(shared_secret, ciphertext, secret_key); + } +#endif /* OQS_PORTABLE_BUILD */ +#else return (OQS_STATUS) PQCLEAN_MCELIECE348864_VEC_crypto_kem_dec(shared_secret, ciphertext, secret_key); +#endif } #endif diff --git a/src/kem/classic_mceliece/kem_classic_mceliece_348864f.c b/src/kem/classic_mceliece/kem_classic_mceliece_348864f.c index b929fa8f8e..e3c1f14acd 100644 --- a/src/kem/classic_mceliece/kem_classic_mceliece_348864f.c +++ 
b/src/kem/classic_mceliece/kem_classic_mceliece_348864f.c @@ -34,16 +34,61 @@ extern int PQCLEAN_MCELIECE348864F_VEC_crypto_kem_keypair(unsigned char *pk, uns extern int PQCLEAN_MCELIECE348864F_VEC_crypto_kem_enc(unsigned char *ct, unsigned char *ss, const unsigned char *pk); extern int PQCLEAN_MCELIECE348864F_VEC_crypto_kem_dec(unsigned char *ss, const unsigned char *ct, const unsigned char *sk); +#if defined(OQS_ENABLE_KEM_classic_mceliece_348864f_avx) +extern int PQCLEAN_MCELIECE348864F_AVX_crypto_kem_keypair(unsigned char *pk, unsigned char *sk); +extern int PQCLEAN_MCELIECE348864F_AVX_crypto_kem_enc(unsigned char *ct, unsigned char *ss, const unsigned char *pk); +extern int PQCLEAN_MCELIECE348864F_AVX_crypto_kem_dec(unsigned char *ss, const unsigned char *ct, const unsigned char *sk); +#endif + OQS_API OQS_STATUS OQS_KEM_classic_mceliece_348864f_keypair(uint8_t *public_key, uint8_t *secret_key) { +#if defined(OQS_ENABLE_KEM_classic_mceliece_348864f_avx) +#if defined(OQS_PORTABLE_BUILD) + OQS_CPU_EXTENSIONS available_cpu_extensions = OQS_get_available_CPU_extensions(); + if (available_cpu_extensions.AVX2_ENABLED && available_cpu_extensions.POPCNT_ENABLED && available_cpu_extensions.BMI1_ENABLED) { +#endif /* OQS_PORTABLE_BUILD */ + return (OQS_STATUS) PQCLEAN_MCELIECE348864F_AVX_crypto_kem_keypair(public_key, secret_key); +#if defined(OQS_PORTABLE_BUILD) + } else { + return (OQS_STATUS) PQCLEAN_MCELIECE348864F_VEC_crypto_kem_keypair(public_key, secret_key); + } +#endif /* OQS_PORTABLE_BUILD */ +#else return (OQS_STATUS) PQCLEAN_MCELIECE348864F_VEC_crypto_kem_keypair(public_key, secret_key); +#endif } OQS_API OQS_STATUS OQS_KEM_classic_mceliece_348864f_encaps(uint8_t *ciphertext, uint8_t *shared_secret, const uint8_t *public_key) { +#if defined(OQS_ENABLE_KEM_classic_mceliece_348864f_avx) +#if defined(OQS_PORTABLE_BUILD) + OQS_CPU_EXTENSIONS available_cpu_extensions = OQS_get_available_CPU_extensions(); + if (available_cpu_extensions.AVX2_ENABLED && available_cpu_extensions.POPCNT_ENABLED && available_cpu_extensions.BMI1_ENABLED) { +#endif /* OQS_PORTABLE_BUILD */ + return (OQS_STATUS) PQCLEAN_MCELIECE348864F_AVX_crypto_kem_enc(ciphertext, shared_secret, public_key); +#if defined(OQS_PORTABLE_BUILD) + } else { + return (OQS_STATUS) PQCLEAN_MCELIECE348864F_VEC_crypto_kem_enc(ciphertext, shared_secret, public_key); + } +#endif /* OQS_PORTABLE_BUILD */ +#else return (OQS_STATUS) PQCLEAN_MCELIECE348864F_VEC_crypto_kem_enc(ciphertext, shared_secret, public_key); +#endif } OQS_API OQS_STATUS OQS_KEM_classic_mceliece_348864f_decaps(uint8_t *shared_secret, const unsigned char *ciphertext, const uint8_t *secret_key) { +#if defined(OQS_ENABLE_KEM_classic_mceliece_348864f_avx) +#if defined(OQS_PORTABLE_BUILD) + OQS_CPU_EXTENSIONS available_cpu_extensions = OQS_get_available_CPU_extensions(); + if (available_cpu_extensions.AVX2_ENABLED && available_cpu_extensions.POPCNT_ENABLED && available_cpu_extensions.BMI1_ENABLED) { +#endif /* OQS_PORTABLE_BUILD */ + return (OQS_STATUS) PQCLEAN_MCELIECE348864F_AVX_crypto_kem_dec(shared_secret, ciphertext, secret_key); +#if defined(OQS_PORTABLE_BUILD) + } else { + return (OQS_STATUS) PQCLEAN_MCELIECE348864F_VEC_crypto_kem_dec(shared_secret, ciphertext, secret_key); + } +#endif /* OQS_PORTABLE_BUILD */ +#else return (OQS_STATUS) PQCLEAN_MCELIECE348864F_VEC_crypto_kem_dec(shared_secret, ciphertext, secret_key); +#endif } #endif diff --git a/src/kem/classic_mceliece/kem_classic_mceliece_460896.c b/src/kem/classic_mceliece/kem_classic_mceliece_460896.c index 
9ecd6a369a..584a7a3362 100644 --- a/src/kem/classic_mceliece/kem_classic_mceliece_460896.c +++ b/src/kem/classic_mceliece/kem_classic_mceliece_460896.c @@ -34,16 +34,61 @@ extern int PQCLEAN_MCELIECE460896_VEC_crypto_kem_keypair(unsigned char *pk, unsi extern int PQCLEAN_MCELIECE460896_VEC_crypto_kem_enc(unsigned char *ct, unsigned char *ss, const unsigned char *pk); extern int PQCLEAN_MCELIECE460896_VEC_crypto_kem_dec(unsigned char *ss, const unsigned char *ct, const unsigned char *sk); +#if defined(OQS_ENABLE_KEM_classic_mceliece_460896_avx) +extern int PQCLEAN_MCELIECE460896_AVX_crypto_kem_keypair(unsigned char *pk, unsigned char *sk); +extern int PQCLEAN_MCELIECE460896_AVX_crypto_kem_enc(unsigned char *ct, unsigned char *ss, const unsigned char *pk); +extern int PQCLEAN_MCELIECE460896_AVX_crypto_kem_dec(unsigned char *ss, const unsigned char *ct, const unsigned char *sk); +#endif + OQS_API OQS_STATUS OQS_KEM_classic_mceliece_460896_keypair(uint8_t *public_key, uint8_t *secret_key) { +#if defined(OQS_ENABLE_KEM_classic_mceliece_460896_avx) +#if defined(OQS_PORTABLE_BUILD) + OQS_CPU_EXTENSIONS available_cpu_extensions = OQS_get_available_CPU_extensions(); + if (available_cpu_extensions.AVX2_ENABLED && available_cpu_extensions.POPCNT_ENABLED) { +#endif /* OQS_PORTABLE_BUILD */ + return (OQS_STATUS) PQCLEAN_MCELIECE460896_AVX_crypto_kem_keypair(public_key, secret_key); +#if defined(OQS_PORTABLE_BUILD) + } else { + return (OQS_STATUS) PQCLEAN_MCELIECE460896_VEC_crypto_kem_keypair(public_key, secret_key); + } +#endif /* OQS_PORTABLE_BUILD */ +#else return (OQS_STATUS) PQCLEAN_MCELIECE460896_VEC_crypto_kem_keypair(public_key, secret_key); +#endif } OQS_API OQS_STATUS OQS_KEM_classic_mceliece_460896_encaps(uint8_t *ciphertext, uint8_t *shared_secret, const uint8_t *public_key) { +#if defined(OQS_ENABLE_KEM_classic_mceliece_460896_avx) +#if defined(OQS_PORTABLE_BUILD) + OQS_CPU_EXTENSIONS available_cpu_extensions = OQS_get_available_CPU_extensions(); + if (available_cpu_extensions.AVX2_ENABLED && available_cpu_extensions.POPCNT_ENABLED) { +#endif /* OQS_PORTABLE_BUILD */ + return (OQS_STATUS) PQCLEAN_MCELIECE460896_AVX_crypto_kem_enc(ciphertext, shared_secret, public_key); +#if defined(OQS_PORTABLE_BUILD) + } else { + return (OQS_STATUS) PQCLEAN_MCELIECE460896_VEC_crypto_kem_enc(ciphertext, shared_secret, public_key); + } +#endif /* OQS_PORTABLE_BUILD */ +#else return (OQS_STATUS) PQCLEAN_MCELIECE460896_VEC_crypto_kem_enc(ciphertext, shared_secret, public_key); +#endif } OQS_API OQS_STATUS OQS_KEM_classic_mceliece_460896_decaps(uint8_t *shared_secret, const unsigned char *ciphertext, const uint8_t *secret_key) { +#if defined(OQS_ENABLE_KEM_classic_mceliece_460896_avx) +#if defined(OQS_PORTABLE_BUILD) + OQS_CPU_EXTENSIONS available_cpu_extensions = OQS_get_available_CPU_extensions(); + if (available_cpu_extensions.AVX2_ENABLED && available_cpu_extensions.POPCNT_ENABLED) { +#endif /* OQS_PORTABLE_BUILD */ + return (OQS_STATUS) PQCLEAN_MCELIECE460896_AVX_crypto_kem_dec(shared_secret, ciphertext, secret_key); +#if defined(OQS_PORTABLE_BUILD) + } else { + return (OQS_STATUS) PQCLEAN_MCELIECE460896_VEC_crypto_kem_dec(shared_secret, ciphertext, secret_key); + } +#endif /* OQS_PORTABLE_BUILD */ +#else return (OQS_STATUS) PQCLEAN_MCELIECE460896_VEC_crypto_kem_dec(shared_secret, ciphertext, secret_key); +#endif } #endif diff --git a/src/kem/classic_mceliece/kem_classic_mceliece_460896f.c b/src/kem/classic_mceliece/kem_classic_mceliece_460896f.c index d7accd3b3d..858ad9c51c 100644 --- 
a/src/kem/classic_mceliece/kem_classic_mceliece_460896f.c +++ b/src/kem/classic_mceliece/kem_classic_mceliece_460896f.c @@ -34,16 +34,61 @@ extern int PQCLEAN_MCELIECE460896F_VEC_crypto_kem_keypair(unsigned char *pk, uns extern int PQCLEAN_MCELIECE460896F_VEC_crypto_kem_enc(unsigned char *ct, unsigned char *ss, const unsigned char *pk); extern int PQCLEAN_MCELIECE460896F_VEC_crypto_kem_dec(unsigned char *ss, const unsigned char *ct, const unsigned char *sk); +#if defined(OQS_ENABLE_KEM_classic_mceliece_460896f_avx) +extern int PQCLEAN_MCELIECE460896F_AVX_crypto_kem_keypair(unsigned char *pk, unsigned char *sk); +extern int PQCLEAN_MCELIECE460896F_AVX_crypto_kem_enc(unsigned char *ct, unsigned char *ss, const unsigned char *pk); +extern int PQCLEAN_MCELIECE460896F_AVX_crypto_kem_dec(unsigned char *ss, const unsigned char *ct, const unsigned char *sk); +#endif + OQS_API OQS_STATUS OQS_KEM_classic_mceliece_460896f_keypair(uint8_t *public_key, uint8_t *secret_key) { +#if defined(OQS_ENABLE_KEM_classic_mceliece_460896f_avx) +#if defined(OQS_PORTABLE_BUILD) + OQS_CPU_EXTENSIONS available_cpu_extensions = OQS_get_available_CPU_extensions(); + if (available_cpu_extensions.AVX2_ENABLED && available_cpu_extensions.BMI1_ENABLED && available_cpu_extensions.POPCNT_ENABLED) { +#endif /* OQS_PORTABLE_BUILD */ + return (OQS_STATUS) PQCLEAN_MCELIECE460896F_AVX_crypto_kem_keypair(public_key, secret_key); +#if defined(OQS_PORTABLE_BUILD) + } else { + return (OQS_STATUS) PQCLEAN_MCELIECE460896F_VEC_crypto_kem_keypair(public_key, secret_key); + } +#endif /* OQS_PORTABLE_BUILD */ +#else return (OQS_STATUS) PQCLEAN_MCELIECE460896F_VEC_crypto_kem_keypair(public_key, secret_key); +#endif } OQS_API OQS_STATUS OQS_KEM_classic_mceliece_460896f_encaps(uint8_t *ciphertext, uint8_t *shared_secret, const uint8_t *public_key) { +#if defined(OQS_ENABLE_KEM_classic_mceliece_460896f_avx) +#if defined(OQS_PORTABLE_BUILD) + OQS_CPU_EXTENSIONS available_cpu_extensions = OQS_get_available_CPU_extensions(); + if (available_cpu_extensions.AVX2_ENABLED && available_cpu_extensions.BMI1_ENABLED && available_cpu_extensions.POPCNT_ENABLED) { +#endif /* OQS_PORTABLE_BUILD */ + return (OQS_STATUS) PQCLEAN_MCELIECE460896F_AVX_crypto_kem_enc(ciphertext, shared_secret, public_key); +#if defined(OQS_PORTABLE_BUILD) + } else { + return (OQS_STATUS) PQCLEAN_MCELIECE460896F_VEC_crypto_kem_enc(ciphertext, shared_secret, public_key); + } +#endif /* OQS_PORTABLE_BUILD */ +#else return (OQS_STATUS) PQCLEAN_MCELIECE460896F_VEC_crypto_kem_enc(ciphertext, shared_secret, public_key); +#endif } OQS_API OQS_STATUS OQS_KEM_classic_mceliece_460896f_decaps(uint8_t *shared_secret, const unsigned char *ciphertext, const uint8_t *secret_key) { +#if defined(OQS_ENABLE_KEM_classic_mceliece_460896f_avx) +#if defined(OQS_PORTABLE_BUILD) + OQS_CPU_EXTENSIONS available_cpu_extensions = OQS_get_available_CPU_extensions(); + if (available_cpu_extensions.AVX2_ENABLED && available_cpu_extensions.BMI1_ENABLED && available_cpu_extensions.POPCNT_ENABLED) { +#endif /* OQS_PORTABLE_BUILD */ + return (OQS_STATUS) PQCLEAN_MCELIECE460896F_AVX_crypto_kem_dec(shared_secret, ciphertext, secret_key); +#if defined(OQS_PORTABLE_BUILD) + } else { + return (OQS_STATUS) PQCLEAN_MCELIECE460896F_VEC_crypto_kem_dec(shared_secret, ciphertext, secret_key); + } +#endif /* OQS_PORTABLE_BUILD */ +#else return (OQS_STATUS) PQCLEAN_MCELIECE460896F_VEC_crypto_kem_dec(shared_secret, ciphertext, secret_key); +#endif } #endif diff --git a/src/kem/classic_mceliece/kem_classic_mceliece_6688128.c 
b/src/kem/classic_mceliece/kem_classic_mceliece_6688128.c index 07854f4fc8..257d7161c1 100644 --- a/src/kem/classic_mceliece/kem_classic_mceliece_6688128.c +++ b/src/kem/classic_mceliece/kem_classic_mceliece_6688128.c @@ -34,16 +34,61 @@ extern int PQCLEAN_MCELIECE6688128_VEC_crypto_kem_keypair(unsigned char *pk, uns extern int PQCLEAN_MCELIECE6688128_VEC_crypto_kem_enc(unsigned char *ct, unsigned char *ss, const unsigned char *pk); extern int PQCLEAN_MCELIECE6688128_VEC_crypto_kem_dec(unsigned char *ss, const unsigned char *ct, const unsigned char *sk); +#if defined(OQS_ENABLE_KEM_classic_mceliece_6688128_avx) +extern int PQCLEAN_MCELIECE6688128_AVX_crypto_kem_keypair(unsigned char *pk, unsigned char *sk); +extern int PQCLEAN_MCELIECE6688128_AVX_crypto_kem_enc(unsigned char *ct, unsigned char *ss, const unsigned char *pk); +extern int PQCLEAN_MCELIECE6688128_AVX_crypto_kem_dec(unsigned char *ss, const unsigned char *ct, const unsigned char *sk); +#endif + OQS_API OQS_STATUS OQS_KEM_classic_mceliece_6688128_keypair(uint8_t *public_key, uint8_t *secret_key) { +#if defined(OQS_ENABLE_KEM_classic_mceliece_6688128_avx) +#if defined(OQS_PORTABLE_BUILD) + OQS_CPU_EXTENSIONS available_cpu_extensions = OQS_get_available_CPU_extensions(); + if (available_cpu_extensions.AVX2_ENABLED && available_cpu_extensions.POPCNT_ENABLED) { +#endif /* OQS_PORTABLE_BUILD */ + return (OQS_STATUS) PQCLEAN_MCELIECE6688128_AVX_crypto_kem_keypair(public_key, secret_key); +#if defined(OQS_PORTABLE_BUILD) + } else { + return (OQS_STATUS) PQCLEAN_MCELIECE6688128_VEC_crypto_kem_keypair(public_key, secret_key); + } +#endif /* OQS_PORTABLE_BUILD */ +#else return (OQS_STATUS) PQCLEAN_MCELIECE6688128_VEC_crypto_kem_keypair(public_key, secret_key); +#endif } OQS_API OQS_STATUS OQS_KEM_classic_mceliece_6688128_encaps(uint8_t *ciphertext, uint8_t *shared_secret, const uint8_t *public_key) { +#if defined(OQS_ENABLE_KEM_classic_mceliece_6688128_avx) +#if defined(OQS_PORTABLE_BUILD) + OQS_CPU_EXTENSIONS available_cpu_extensions = OQS_get_available_CPU_extensions(); + if (available_cpu_extensions.AVX2_ENABLED && available_cpu_extensions.POPCNT_ENABLED) { +#endif /* OQS_PORTABLE_BUILD */ + return (OQS_STATUS) PQCLEAN_MCELIECE6688128_AVX_crypto_kem_enc(ciphertext, shared_secret, public_key); +#if defined(OQS_PORTABLE_BUILD) + } else { + return (OQS_STATUS) PQCLEAN_MCELIECE6688128_VEC_crypto_kem_enc(ciphertext, shared_secret, public_key); + } +#endif /* OQS_PORTABLE_BUILD */ +#else return (OQS_STATUS) PQCLEAN_MCELIECE6688128_VEC_crypto_kem_enc(ciphertext, shared_secret, public_key); +#endif } OQS_API OQS_STATUS OQS_KEM_classic_mceliece_6688128_decaps(uint8_t *shared_secret, const unsigned char *ciphertext, const uint8_t *secret_key) { +#if defined(OQS_ENABLE_KEM_classic_mceliece_6688128_avx) +#if defined(OQS_PORTABLE_BUILD) + OQS_CPU_EXTENSIONS available_cpu_extensions = OQS_get_available_CPU_extensions(); + if (available_cpu_extensions.AVX2_ENABLED && available_cpu_extensions.POPCNT_ENABLED) { +#endif /* OQS_PORTABLE_BUILD */ + return (OQS_STATUS) PQCLEAN_MCELIECE6688128_AVX_crypto_kem_dec(shared_secret, ciphertext, secret_key); +#if defined(OQS_PORTABLE_BUILD) + } else { + return (OQS_STATUS) PQCLEAN_MCELIECE6688128_VEC_crypto_kem_dec(shared_secret, ciphertext, secret_key); + } +#endif /* OQS_PORTABLE_BUILD */ +#else return (OQS_STATUS) PQCLEAN_MCELIECE6688128_VEC_crypto_kem_dec(shared_secret, ciphertext, secret_key); +#endif } #endif diff --git a/src/kem/classic_mceliece/kem_classic_mceliece_6688128f.c 
b/src/kem/classic_mceliece/kem_classic_mceliece_6688128f.c index ed2149a515..d15907b001 100644 --- a/src/kem/classic_mceliece/kem_classic_mceliece_6688128f.c +++ b/src/kem/classic_mceliece/kem_classic_mceliece_6688128f.c @@ -34,16 +34,61 @@ extern int PQCLEAN_MCELIECE6688128F_VEC_crypto_kem_keypair(unsigned char *pk, un extern int PQCLEAN_MCELIECE6688128F_VEC_crypto_kem_enc(unsigned char *ct, unsigned char *ss, const unsigned char *pk); extern int PQCLEAN_MCELIECE6688128F_VEC_crypto_kem_dec(unsigned char *ss, const unsigned char *ct, const unsigned char *sk); +#if defined(OQS_ENABLE_KEM_classic_mceliece_6688128f_avx) +extern int PQCLEAN_MCELIECE6688128F_AVX_crypto_kem_keypair(unsigned char *pk, unsigned char *sk); +extern int PQCLEAN_MCELIECE6688128F_AVX_crypto_kem_enc(unsigned char *ct, unsigned char *ss, const unsigned char *pk); +extern int PQCLEAN_MCELIECE6688128F_AVX_crypto_kem_dec(unsigned char *ss, const unsigned char *ct, const unsigned char *sk); +#endif + OQS_API OQS_STATUS OQS_KEM_classic_mceliece_6688128f_keypair(uint8_t *public_key, uint8_t *secret_key) { +#if defined(OQS_ENABLE_KEM_classic_mceliece_6688128f_avx) +#if defined(OQS_PORTABLE_BUILD) + OQS_CPU_EXTENSIONS available_cpu_extensions = OQS_get_available_CPU_extensions(); + if (available_cpu_extensions.AVX2_ENABLED && available_cpu_extensions.BMI1_ENABLED && available_cpu_extensions.POPCNT_ENABLED) { +#endif /* OQS_PORTABLE_BUILD */ + return (OQS_STATUS) PQCLEAN_MCELIECE6688128F_AVX_crypto_kem_keypair(public_key, secret_key); +#if defined(OQS_PORTABLE_BUILD) + } else { + return (OQS_STATUS) PQCLEAN_MCELIECE6688128F_VEC_crypto_kem_keypair(public_key, secret_key); + } +#endif /* OQS_PORTABLE_BUILD */ +#else return (OQS_STATUS) PQCLEAN_MCELIECE6688128F_VEC_crypto_kem_keypair(public_key, secret_key); +#endif } OQS_API OQS_STATUS OQS_KEM_classic_mceliece_6688128f_encaps(uint8_t *ciphertext, uint8_t *shared_secret, const uint8_t *public_key) { +#if defined(OQS_ENABLE_KEM_classic_mceliece_6688128f_avx) +#if defined(OQS_PORTABLE_BUILD) + OQS_CPU_EXTENSIONS available_cpu_extensions = OQS_get_available_CPU_extensions(); + if (available_cpu_extensions.AVX2_ENABLED && available_cpu_extensions.BMI1_ENABLED && available_cpu_extensions.POPCNT_ENABLED) { +#endif /* OQS_PORTABLE_BUILD */ + return (OQS_STATUS) PQCLEAN_MCELIECE6688128F_AVX_crypto_kem_enc(ciphertext, shared_secret, public_key); +#if defined(OQS_PORTABLE_BUILD) + } else { + return (OQS_STATUS) PQCLEAN_MCELIECE6688128F_VEC_crypto_kem_enc(ciphertext, shared_secret, public_key); + } +#endif /* OQS_PORTABLE_BUILD */ +#else return (OQS_STATUS) PQCLEAN_MCELIECE6688128F_VEC_crypto_kem_enc(ciphertext, shared_secret, public_key); +#endif } OQS_API OQS_STATUS OQS_KEM_classic_mceliece_6688128f_decaps(uint8_t *shared_secret, const unsigned char *ciphertext, const uint8_t *secret_key) { +#if defined(OQS_ENABLE_KEM_classic_mceliece_6688128f_avx) +#if defined(OQS_PORTABLE_BUILD) + OQS_CPU_EXTENSIONS available_cpu_extensions = OQS_get_available_CPU_extensions(); + if (available_cpu_extensions.AVX2_ENABLED && available_cpu_extensions.BMI1_ENABLED && available_cpu_extensions.POPCNT_ENABLED) { +#endif /* OQS_PORTABLE_BUILD */ + return (OQS_STATUS) PQCLEAN_MCELIECE6688128F_AVX_crypto_kem_dec(shared_secret, ciphertext, secret_key); +#if defined(OQS_PORTABLE_BUILD) + } else { + return (OQS_STATUS) PQCLEAN_MCELIECE6688128F_VEC_crypto_kem_dec(shared_secret, ciphertext, secret_key); + } +#endif /* OQS_PORTABLE_BUILD */ +#else return (OQS_STATUS) 
PQCLEAN_MCELIECE6688128F_VEC_crypto_kem_dec(shared_secret, ciphertext, secret_key); +#endif } #endif diff --git a/src/kem/classic_mceliece/kem_classic_mceliece_6960119.c b/src/kem/classic_mceliece/kem_classic_mceliece_6960119.c index 1f68827698..57b46a1232 100644 --- a/src/kem/classic_mceliece/kem_classic_mceliece_6960119.c +++ b/src/kem/classic_mceliece/kem_classic_mceliece_6960119.c @@ -34,16 +34,61 @@ extern int PQCLEAN_MCELIECE6960119_VEC_crypto_kem_keypair(unsigned char *pk, uns extern int PQCLEAN_MCELIECE6960119_VEC_crypto_kem_enc(unsigned char *ct, unsigned char *ss, const unsigned char *pk); extern int PQCLEAN_MCELIECE6960119_VEC_crypto_kem_dec(unsigned char *ss, const unsigned char *ct, const unsigned char *sk); +#if defined(OQS_ENABLE_KEM_classic_mceliece_6960119_avx) +extern int PQCLEAN_MCELIECE6960119_AVX_crypto_kem_keypair(unsigned char *pk, unsigned char *sk); +extern int PQCLEAN_MCELIECE6960119_AVX_crypto_kem_enc(unsigned char *ct, unsigned char *ss, const unsigned char *pk); +extern int PQCLEAN_MCELIECE6960119_AVX_crypto_kem_dec(unsigned char *ss, const unsigned char *ct, const unsigned char *sk); +#endif + OQS_API OQS_STATUS OQS_KEM_classic_mceliece_6960119_keypair(uint8_t *public_key, uint8_t *secret_key) { +#if defined(OQS_ENABLE_KEM_classic_mceliece_6960119_avx) +#if defined(OQS_PORTABLE_BUILD) + OQS_CPU_EXTENSIONS available_cpu_extensions = OQS_get_available_CPU_extensions(); + if (available_cpu_extensions.AVX2_ENABLED && available_cpu_extensions.POPCNT_ENABLED) { +#endif /* OQS_PORTABLE_BUILD */ + return (OQS_STATUS) PQCLEAN_MCELIECE6960119_AVX_crypto_kem_keypair(public_key, secret_key); +#if defined(OQS_PORTABLE_BUILD) + } else { + return (OQS_STATUS) PQCLEAN_MCELIECE6960119_VEC_crypto_kem_keypair(public_key, secret_key); + } +#endif /* OQS_PORTABLE_BUILD */ +#else return (OQS_STATUS) PQCLEAN_MCELIECE6960119_VEC_crypto_kem_keypair(public_key, secret_key); +#endif } OQS_API OQS_STATUS OQS_KEM_classic_mceliece_6960119_encaps(uint8_t *ciphertext, uint8_t *shared_secret, const uint8_t *public_key) { +#if defined(OQS_ENABLE_KEM_classic_mceliece_6960119_avx) +#if defined(OQS_PORTABLE_BUILD) + OQS_CPU_EXTENSIONS available_cpu_extensions = OQS_get_available_CPU_extensions(); + if (available_cpu_extensions.AVX2_ENABLED && available_cpu_extensions.POPCNT_ENABLED) { +#endif /* OQS_PORTABLE_BUILD */ + return (OQS_STATUS) PQCLEAN_MCELIECE6960119_AVX_crypto_kem_enc(ciphertext, shared_secret, public_key); +#if defined(OQS_PORTABLE_BUILD) + } else { + return (OQS_STATUS) PQCLEAN_MCELIECE6960119_VEC_crypto_kem_enc(ciphertext, shared_secret, public_key); + } +#endif /* OQS_PORTABLE_BUILD */ +#else return (OQS_STATUS) PQCLEAN_MCELIECE6960119_VEC_crypto_kem_enc(ciphertext, shared_secret, public_key); +#endif } OQS_API OQS_STATUS OQS_KEM_classic_mceliece_6960119_decaps(uint8_t *shared_secret, const unsigned char *ciphertext, const uint8_t *secret_key) { +#if defined(OQS_ENABLE_KEM_classic_mceliece_6960119_avx) +#if defined(OQS_PORTABLE_BUILD) + OQS_CPU_EXTENSIONS available_cpu_extensions = OQS_get_available_CPU_extensions(); + if (available_cpu_extensions.AVX2_ENABLED && available_cpu_extensions.POPCNT_ENABLED) { +#endif /* OQS_PORTABLE_BUILD */ + return (OQS_STATUS) PQCLEAN_MCELIECE6960119_AVX_crypto_kem_dec(shared_secret, ciphertext, secret_key); +#if defined(OQS_PORTABLE_BUILD) + } else { + return (OQS_STATUS) PQCLEAN_MCELIECE6960119_VEC_crypto_kem_dec(shared_secret, ciphertext, secret_key); + } +#endif /* OQS_PORTABLE_BUILD */ +#else return (OQS_STATUS) 
PQCLEAN_MCELIECE6960119_VEC_crypto_kem_dec(shared_secret, ciphertext, secret_key); +#endif } #endif diff --git a/src/kem/classic_mceliece/kem_classic_mceliece_6960119f.c b/src/kem/classic_mceliece/kem_classic_mceliece_6960119f.c index cf18f134c6..3bcaaf88cd 100644 --- a/src/kem/classic_mceliece/kem_classic_mceliece_6960119f.c +++ b/src/kem/classic_mceliece/kem_classic_mceliece_6960119f.c @@ -34,16 +34,61 @@ extern int PQCLEAN_MCELIECE6960119F_VEC_crypto_kem_keypair(unsigned char *pk, un extern int PQCLEAN_MCELIECE6960119F_VEC_crypto_kem_enc(unsigned char *ct, unsigned char *ss, const unsigned char *pk); extern int PQCLEAN_MCELIECE6960119F_VEC_crypto_kem_dec(unsigned char *ss, const unsigned char *ct, const unsigned char *sk); +#if defined(OQS_ENABLE_KEM_classic_mceliece_6960119f_avx) +extern int PQCLEAN_MCELIECE6960119F_AVX_crypto_kem_keypair(unsigned char *pk, unsigned char *sk); +extern int PQCLEAN_MCELIECE6960119F_AVX_crypto_kem_enc(unsigned char *ct, unsigned char *ss, const unsigned char *pk); +extern int PQCLEAN_MCELIECE6960119F_AVX_crypto_kem_dec(unsigned char *ss, const unsigned char *ct, const unsigned char *sk); +#endif + OQS_API OQS_STATUS OQS_KEM_classic_mceliece_6960119f_keypair(uint8_t *public_key, uint8_t *secret_key) { +#if defined(OQS_ENABLE_KEM_classic_mceliece_6960119f_avx) +#if defined(OQS_PORTABLE_BUILD) + OQS_CPU_EXTENSIONS available_cpu_extensions = OQS_get_available_CPU_extensions(); + if (available_cpu_extensions.AVX2_ENABLED && available_cpu_extensions.BMI1_ENABLED && available_cpu_extensions.POPCNT_ENABLED) { +#endif /* OQS_PORTABLE_BUILD */ + return (OQS_STATUS) PQCLEAN_MCELIECE6960119F_AVX_crypto_kem_keypair(public_key, secret_key); +#if defined(OQS_PORTABLE_BUILD) + } else { + return (OQS_STATUS) PQCLEAN_MCELIECE6960119F_VEC_crypto_kem_keypair(public_key, secret_key); + } +#endif /* OQS_PORTABLE_BUILD */ +#else return (OQS_STATUS) PQCLEAN_MCELIECE6960119F_VEC_crypto_kem_keypair(public_key, secret_key); +#endif } OQS_API OQS_STATUS OQS_KEM_classic_mceliece_6960119f_encaps(uint8_t *ciphertext, uint8_t *shared_secret, const uint8_t *public_key) { +#if defined(OQS_ENABLE_KEM_classic_mceliece_6960119f_avx) +#if defined(OQS_PORTABLE_BUILD) + OQS_CPU_EXTENSIONS available_cpu_extensions = OQS_get_available_CPU_extensions(); + if (available_cpu_extensions.AVX2_ENABLED && available_cpu_extensions.BMI1_ENABLED && available_cpu_extensions.POPCNT_ENABLED) { +#endif /* OQS_PORTABLE_BUILD */ + return (OQS_STATUS) PQCLEAN_MCELIECE6960119F_AVX_crypto_kem_enc(ciphertext, shared_secret, public_key); +#if defined(OQS_PORTABLE_BUILD) + } else { + return (OQS_STATUS) PQCLEAN_MCELIECE6960119F_VEC_crypto_kem_enc(ciphertext, shared_secret, public_key); + } +#endif /* OQS_PORTABLE_BUILD */ +#else return (OQS_STATUS) PQCLEAN_MCELIECE6960119F_VEC_crypto_kem_enc(ciphertext, shared_secret, public_key); +#endif } OQS_API OQS_STATUS OQS_KEM_classic_mceliece_6960119f_decaps(uint8_t *shared_secret, const unsigned char *ciphertext, const uint8_t *secret_key) { +#if defined(OQS_ENABLE_KEM_classic_mceliece_6960119f_avx) +#if defined(OQS_PORTABLE_BUILD) + OQS_CPU_EXTENSIONS available_cpu_extensions = OQS_get_available_CPU_extensions(); + if (available_cpu_extensions.AVX2_ENABLED && available_cpu_extensions.BMI1_ENABLED && available_cpu_extensions.POPCNT_ENABLED) { +#endif /* OQS_PORTABLE_BUILD */ + return (OQS_STATUS) PQCLEAN_MCELIECE6960119F_AVX_crypto_kem_dec(shared_secret, ciphertext, secret_key); +#if defined(OQS_PORTABLE_BUILD) + } else { + return (OQS_STATUS) 
PQCLEAN_MCELIECE6960119F_VEC_crypto_kem_dec(shared_secret, ciphertext, secret_key); + } +#endif /* OQS_PORTABLE_BUILD */ +#else return (OQS_STATUS) PQCLEAN_MCELIECE6960119F_VEC_crypto_kem_dec(shared_secret, ciphertext, secret_key); +#endif } #endif diff --git a/src/kem/classic_mceliece/kem_classic_mceliece_8192128.c b/src/kem/classic_mceliece/kem_classic_mceliece_8192128.c index e722d4646a..38651614bf 100644 --- a/src/kem/classic_mceliece/kem_classic_mceliece_8192128.c +++ b/src/kem/classic_mceliece/kem_classic_mceliece_8192128.c @@ -34,16 +34,61 @@ extern int PQCLEAN_MCELIECE8192128_VEC_crypto_kem_keypair(unsigned char *pk, uns extern int PQCLEAN_MCELIECE8192128_VEC_crypto_kem_enc(unsigned char *ct, unsigned char *ss, const unsigned char *pk); extern int PQCLEAN_MCELIECE8192128_VEC_crypto_kem_dec(unsigned char *ss, const unsigned char *ct, const unsigned char *sk); +#if defined(OQS_ENABLE_KEM_classic_mceliece_8192128_avx) +extern int PQCLEAN_MCELIECE8192128_AVX_crypto_kem_keypair(unsigned char *pk, unsigned char *sk); +extern int PQCLEAN_MCELIECE8192128_AVX_crypto_kem_enc(unsigned char *ct, unsigned char *ss, const unsigned char *pk); +extern int PQCLEAN_MCELIECE8192128_AVX_crypto_kem_dec(unsigned char *ss, const unsigned char *ct, const unsigned char *sk); +#endif + OQS_API OQS_STATUS OQS_KEM_classic_mceliece_8192128_keypair(uint8_t *public_key, uint8_t *secret_key) { +#if defined(OQS_ENABLE_KEM_classic_mceliece_8192128_avx) +#if defined(OQS_PORTABLE_BUILD) + OQS_CPU_EXTENSIONS available_cpu_extensions = OQS_get_available_CPU_extensions(); + if (available_cpu_extensions.AVX2_ENABLED && available_cpu_extensions.POPCNT_ENABLED) { +#endif /* OQS_PORTABLE_BUILD */ + return (OQS_STATUS) PQCLEAN_MCELIECE8192128_AVX_crypto_kem_keypair(public_key, secret_key); +#if defined(OQS_PORTABLE_BUILD) + } else { + return (OQS_STATUS) PQCLEAN_MCELIECE8192128_VEC_crypto_kem_keypair(public_key, secret_key); + } +#endif /* OQS_PORTABLE_BUILD */ +#else return (OQS_STATUS) PQCLEAN_MCELIECE8192128_VEC_crypto_kem_keypair(public_key, secret_key); +#endif } OQS_API OQS_STATUS OQS_KEM_classic_mceliece_8192128_encaps(uint8_t *ciphertext, uint8_t *shared_secret, const uint8_t *public_key) { +#if defined(OQS_ENABLE_KEM_classic_mceliece_8192128_avx) +#if defined(OQS_PORTABLE_BUILD) + OQS_CPU_EXTENSIONS available_cpu_extensions = OQS_get_available_CPU_extensions(); + if (available_cpu_extensions.AVX2_ENABLED && available_cpu_extensions.POPCNT_ENABLED) { +#endif /* OQS_PORTABLE_BUILD */ + return (OQS_STATUS) PQCLEAN_MCELIECE8192128_AVX_crypto_kem_enc(ciphertext, shared_secret, public_key); +#if defined(OQS_PORTABLE_BUILD) + } else { + return (OQS_STATUS) PQCLEAN_MCELIECE8192128_VEC_crypto_kem_enc(ciphertext, shared_secret, public_key); + } +#endif /* OQS_PORTABLE_BUILD */ +#else return (OQS_STATUS) PQCLEAN_MCELIECE8192128_VEC_crypto_kem_enc(ciphertext, shared_secret, public_key); +#endif } OQS_API OQS_STATUS OQS_KEM_classic_mceliece_8192128_decaps(uint8_t *shared_secret, const unsigned char *ciphertext, const uint8_t *secret_key) { +#if defined(OQS_ENABLE_KEM_classic_mceliece_8192128_avx) +#if defined(OQS_PORTABLE_BUILD) + OQS_CPU_EXTENSIONS available_cpu_extensions = OQS_get_available_CPU_extensions(); + if (available_cpu_extensions.AVX2_ENABLED && available_cpu_extensions.POPCNT_ENABLED) { +#endif /* OQS_PORTABLE_BUILD */ + return (OQS_STATUS) PQCLEAN_MCELIECE8192128_AVX_crypto_kem_dec(shared_secret, ciphertext, secret_key); +#if defined(OQS_PORTABLE_BUILD) + } else { + return (OQS_STATUS) 
PQCLEAN_MCELIECE8192128_VEC_crypto_kem_dec(shared_secret, ciphertext, secret_key); + } +#endif /* OQS_PORTABLE_BUILD */ +#else return (OQS_STATUS) PQCLEAN_MCELIECE8192128_VEC_crypto_kem_dec(shared_secret, ciphertext, secret_key); +#endif } #endif diff --git a/src/kem/classic_mceliece/kem_classic_mceliece_8192128f.c b/src/kem/classic_mceliece/kem_classic_mceliece_8192128f.c index ae7d582132..0687936623 100644 --- a/src/kem/classic_mceliece/kem_classic_mceliece_8192128f.c +++ b/src/kem/classic_mceliece/kem_classic_mceliece_8192128f.c @@ -34,16 +34,61 @@ extern int PQCLEAN_MCELIECE8192128F_VEC_crypto_kem_keypair(unsigned char *pk, un extern int PQCLEAN_MCELIECE8192128F_VEC_crypto_kem_enc(unsigned char *ct, unsigned char *ss, const unsigned char *pk); extern int PQCLEAN_MCELIECE8192128F_VEC_crypto_kem_dec(unsigned char *ss, const unsigned char *ct, const unsigned char *sk); +#if defined(OQS_ENABLE_KEM_classic_mceliece_8192128f_avx) +extern int PQCLEAN_MCELIECE8192128F_AVX_crypto_kem_keypair(unsigned char *pk, unsigned char *sk); +extern int PQCLEAN_MCELIECE8192128F_AVX_crypto_kem_enc(unsigned char *ct, unsigned char *ss, const unsigned char *pk); +extern int PQCLEAN_MCELIECE8192128F_AVX_crypto_kem_dec(unsigned char *ss, const unsigned char *ct, const unsigned char *sk); +#endif + OQS_API OQS_STATUS OQS_KEM_classic_mceliece_8192128f_keypair(uint8_t *public_key, uint8_t *secret_key) { +#if defined(OQS_ENABLE_KEM_classic_mceliece_8192128f_avx) +#if defined(OQS_PORTABLE_BUILD) + OQS_CPU_EXTENSIONS available_cpu_extensions = OQS_get_available_CPU_extensions(); + if (available_cpu_extensions.AVX2_ENABLED && available_cpu_extensions.POPCNT_ENABLED && available_cpu_extensions.BMI1_ENABLED) { +#endif /* OQS_PORTABLE_BUILD */ + return (OQS_STATUS) PQCLEAN_MCELIECE8192128F_AVX_crypto_kem_keypair(public_key, secret_key); +#if defined(OQS_PORTABLE_BUILD) + } else { + return (OQS_STATUS) PQCLEAN_MCELIECE8192128F_VEC_crypto_kem_keypair(public_key, secret_key); + } +#endif /* OQS_PORTABLE_BUILD */ +#else return (OQS_STATUS) PQCLEAN_MCELIECE8192128F_VEC_crypto_kem_keypair(public_key, secret_key); +#endif } OQS_API OQS_STATUS OQS_KEM_classic_mceliece_8192128f_encaps(uint8_t *ciphertext, uint8_t *shared_secret, const uint8_t *public_key) { +#if defined(OQS_ENABLE_KEM_classic_mceliece_8192128f_avx) +#if defined(OQS_PORTABLE_BUILD) + OQS_CPU_EXTENSIONS available_cpu_extensions = OQS_get_available_CPU_extensions(); + if (available_cpu_extensions.AVX2_ENABLED && available_cpu_extensions.POPCNT_ENABLED && available_cpu_extensions.BMI1_ENABLED) { +#endif /* OQS_PORTABLE_BUILD */ + return (OQS_STATUS) PQCLEAN_MCELIECE8192128F_AVX_crypto_kem_enc(ciphertext, shared_secret, public_key); +#if defined(OQS_PORTABLE_BUILD) + } else { + return (OQS_STATUS) PQCLEAN_MCELIECE8192128F_VEC_crypto_kem_enc(ciphertext, shared_secret, public_key); + } +#endif /* OQS_PORTABLE_BUILD */ +#else return (OQS_STATUS) PQCLEAN_MCELIECE8192128F_VEC_crypto_kem_enc(ciphertext, shared_secret, public_key); +#endif } OQS_API OQS_STATUS OQS_KEM_classic_mceliece_8192128f_decaps(uint8_t *shared_secret, const unsigned char *ciphertext, const uint8_t *secret_key) { +#if defined(OQS_ENABLE_KEM_classic_mceliece_8192128f_avx) +#if defined(OQS_PORTABLE_BUILD) + OQS_CPU_EXTENSIONS available_cpu_extensions = OQS_get_available_CPU_extensions(); + if (available_cpu_extensions.AVX2_ENABLED && available_cpu_extensions.POPCNT_ENABLED && available_cpu_extensions.BMI1_ENABLED) { +#endif /* OQS_PORTABLE_BUILD */ + return (OQS_STATUS) 
PQCLEAN_MCELIECE8192128F_AVX_crypto_kem_dec(shared_secret, ciphertext, secret_key); +#if defined(OQS_PORTABLE_BUILD) + } else { + return (OQS_STATUS) PQCLEAN_MCELIECE8192128F_VEC_crypto_kem_dec(shared_secret, ciphertext, secret_key); + } +#endif /* OQS_PORTABLE_BUILD */ +#else return (OQS_STATUS) PQCLEAN_MCELIECE8192128F_VEC_crypto_kem_dec(shared_secret, ciphertext, secret_key); +#endif } #endif diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_avx/LICENSE b/src/kem/classic_mceliece/pqclean_mceliece348864_avx/LICENSE new file mode 100644 index 0000000000..eba3e7ced4 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_avx/LICENSE @@ -0,0 +1,16 @@ +Public Domain. + +Authors of Classic McEliece in alphabetical order: + +Daniel J. Bernstein, University of Illinois at Chicago +Tung Chou, Osaka University +Tanja Lange, Technische Universiteit Eindhoven +Ingo von Maurich, self +Rafael Misoczki, Intel Corporation +Ruben Niederhagen, Fraunhofer SIT +Edoardo Persichetti, Florida Atlantic University +Christiane Peters, self +Peter Schwabe, Radboud University +Nicolas Sendrier, Inria +Jakub Szefer, Yale University +Wen Wang, Yale University diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_avx/aes256ctr.c b/src/kem/classic_mceliece/pqclean_mceliece348864_avx/aes256ctr.c new file mode 100644 index 0000000000..091435868b --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_avx/aes256ctr.c @@ -0,0 +1,13 @@ +#include "aes256ctr.h" + +void PQCLEAN_MCELIECE348864_AVX_aes256ctr( + uint8_t *out, + size_t outlen, + const uint8_t nonce[AESCTR_NONCEBYTES], + const uint8_t key[AES256_KEYBYTES]) { + + aes256ctx state; + aes256_ctr_keyexp(&state, key); + aes256_ctr(out, outlen, nonce, &state); + aes256_ctx_release(&state); +} diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_avx/aes256ctr.h b/src/kem/classic_mceliece/pqclean_mceliece348864_avx/aes256ctr.h new file mode 100644 index 0000000000..74213af22e --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_avx/aes256ctr.h @@ -0,0 +1,17 @@ +#ifndef PQCLEAN_MCELIECE348864_AVX_AES256CTR_H +#define PQCLEAN_MCELIECE348864_AVX_AES256CTR_H + +#include +#include + +#include "aes.h" + + +void PQCLEAN_MCELIECE348864_AVX_aes256ctr( + uint8_t *out, + size_t outlen, + const uint8_t nonce[AESCTR_NONCEBYTES], + const uint8_t key[AES256_KEYBYTES] +); + +#endif diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_avx/api.h b/src/kem/classic_mceliece/pqclean_mceliece348864_avx/api.h new file mode 100644 index 0000000000..15f2b6ec81 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_avx/api.h @@ -0,0 +1,32 @@ +#ifndef PQCLEAN_MCELIECE348864_AVX_API_H +#define PQCLEAN_MCELIECE348864_AVX_API_H + +#include + +#define PQCLEAN_MCELIECE348864_AVX_CRYPTO_ALGNAME "Classic McEliece 348864" +#define PQCLEAN_MCELIECE348864_AVX_CRYPTO_PUBLICKEYBYTES 261120 +#define PQCLEAN_MCELIECE348864_AVX_CRYPTO_SECRETKEYBYTES 6452 +#define PQCLEAN_MCELIECE348864_AVX_CRYPTO_CIPHERTEXTBYTES 128 +#define PQCLEAN_MCELIECE348864_AVX_CRYPTO_BYTES 32 + + +int PQCLEAN_MCELIECE348864_AVX_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const uint8_t *pk +); + +int PQCLEAN_MCELIECE348864_AVX_crypto_kem_dec( + uint8_t *key, + const uint8_t *c, + const uint8_t *sk +); + +int PQCLEAN_MCELIECE348864_AVX_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_avx/benes.c b/src/kem/classic_mceliece/pqclean_mceliece348864_avx/benes.c new file mode 
100644 index 0000000000..d172388008 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_avx/benes.c @@ -0,0 +1,287 @@ +/* + This file is for Benes network related functions +*/ +#include "benes.h" + +#include "params.h" +#include "transpose.h" +#include "util.h" + +static void layer_0(uint64_t *bs, const uint64_t *cond) { + int x; + uint64_t diff; + + for (x = 0; x < (1 << 6); x += 2) { + diff = bs[ x ] ^ bs[ x + 1 ]; + diff &= *cond++; + bs[ x ] ^= diff; + bs[ x + 1 ] ^= diff; + } +} + +static void layer_1(uint64_t *bs, const uint64_t *cond) { + int x; + uint64_t diff; + + for (x = 0; x < (1 << 6); x += 4) { + diff = bs[ x + 0 ] ^ bs[ x + 2 ]; + diff &= cond[0]; + bs[ x + 0 ] ^= diff; + bs[ x + 2 ] ^= diff; + + diff = bs[ x + 1 ] ^ bs[ x + 3 ]; + diff &= cond[1]; + bs[ x + 1 ] ^= diff; + bs[ x + 3 ] ^= diff; + + cond += 2; + } +} + +static void layer_2(uint64_t *bs, const uint64_t *cond) { + int x; + uint64_t diff; + + for (x = 0; x < (1 << 6); x += 8) { + diff = bs[ x + 0 ] ^ bs[ x + 4 ]; + diff &= cond[0]; + bs[ x + 0 ] ^= diff; + bs[ x + 4 ] ^= diff; + + diff = bs[ x + 1 ] ^ bs[ x + 5 ]; + diff &= cond[1]; + bs[ x + 1 ] ^= diff; + bs[ x + 5 ] ^= diff; + + diff = bs[ x + 2 ] ^ bs[ x + 6 ]; + diff &= cond[2]; + bs[ x + 2 ] ^= diff; + bs[ x + 6 ] ^= diff; + + diff = bs[ x + 3 ] ^ bs[ x + 7 ]; + diff &= cond[3]; + bs[ x + 3 ] ^= diff; + bs[ x + 7 ] ^= diff; + + cond += 4; + } +} + +static void layer_3(uint64_t *bs, const uint64_t *cond) { + int x, s; + uint64_t diff; + + for (x = 0; x < (1 << 6); x += 16) { + for (s = x; s < x + 8; s += 4) { + diff = bs[ s + 0 ] ^ bs[ s + 8 ]; + diff &= cond[0]; + bs[ s + 0 ] ^= diff; + bs[ s + 8 ] ^= diff; + + diff = bs[ s + 1 ] ^ bs[ s + 9 ]; + diff &= cond[1]; + bs[ s + 1 ] ^= diff; + bs[ s + 9 ] ^= diff; + + diff = bs[ s + 2 ] ^ bs[ s + 10 ]; + diff &= cond[2]; + bs[ s + 2 ] ^= diff; + bs[ s + 10 ] ^= diff; + + diff = bs[ s + 3 ] ^ bs[ s + 11 ]; + diff &= cond[3]; + bs[ s + 3 ] ^= diff; + bs[ s + 11 ] ^= diff; + + cond += 4; + } + } +} + +static void layer_4(uint64_t *bs, const uint64_t *cond) { + int x, s; + uint64_t diff; + + for (x = 0; x < (1 << 6); x += 32) { + for (s = x; s < x + 16; s += 4) { + diff = bs[ s + 0 ] ^ bs[ s + 16 ]; + diff &= cond[0]; + bs[ s + 0 ] ^= diff; + bs[ s + 16 ] ^= diff; + + diff = bs[ s + 1 ] ^ bs[ s + 17 ]; + diff &= cond[1]; + bs[ s + 1 ] ^= diff; + bs[ s + 17 ] ^= diff; + + diff = bs[ s + 2 ] ^ bs[ s + 18 ]; + diff &= cond[2]; + bs[ s + 2 ] ^= diff; + bs[ s + 18 ] ^= diff; + + diff = bs[ s + 3 ] ^ bs[ s + 19 ]; + diff &= cond[3]; + bs[ s + 3 ] ^= diff; + bs[ s + 19 ] ^= diff; + + cond += 4; + } + } +} + +static void layer_5(uint64_t *bs, const uint64_t *cond) { + int x, s; + uint64_t diff; + + for (x = 0; x < (1 << 6); x += 64) { + for (s = x; s < x + 32; s += 4) { + diff = bs[ s + 0 ] ^ bs[ s + 32 ]; + diff &= cond[0]; + bs[ s + 0 ] ^= diff; + bs[ s + 32 ] ^= diff; + + diff = bs[ s + 1 ] ^ bs[ s + 33 ]; + diff &= cond[1]; + bs[ s + 1 ] ^= diff; + bs[ s + 33 ] ^= diff; + + diff = bs[ s + 2 ] ^ bs[ s + 34 ]; + diff &= cond[2]; + bs[ s + 2 ] ^= diff; + bs[ s + 34 ] ^= diff; + + diff = bs[ s + 3 ] ^ bs[ s + 35 ]; + diff &= cond[3]; + bs[ s + 3 ] ^= diff; + bs[ s + 35 ] ^= diff; + + cond += 4; + } + } +} + +/* input: bits, control bits as array of bytes */ +/* output: out, control bits as array of 128-bit vectors */ +void PQCLEAN_MCELIECE348864_AVX_load_bits(uint64_t out[][32], const unsigned char *bits) { + int i, low, block = 0; + + uint64_t cond[64]; + + // + + for (low = 0; low <= 5; low++) { + for (i = 
0; i < 64; i++) { + cond[i] = PQCLEAN_MCELIECE348864_AVX_load4(bits + block * 256 + i * 4); + } + PQCLEAN_MCELIECE348864_AVX_transpose_64x64(cond); + + for (i = 0; i < 32; i++) { + out[ block ][i] = cond[i]; + } + block++; + } + + for (low = 0; low <= 5; low++) { + for (i = 0; i < 32; i++) { + out[ block ][i] = PQCLEAN_MCELIECE348864_AVX_load8(bits + block * 256 + i * 8); + } + block++; + } + + for (low = 4; low >= 0; low--) { + for (i = 0; i < 32; i++) { + out[ block ][i] = PQCLEAN_MCELIECE348864_AVX_load8(bits + block * 256 + i * 8); + } + block++; + } + + for (low = 5; low >= 0; low--) { + for (i = 0; i < 64; i++) { + cond[i] = PQCLEAN_MCELIECE348864_AVX_load4(bits + block * 256 + i * 4); + } + PQCLEAN_MCELIECE348864_AVX_transpose_64x64(cond); + + for (i = 0; i < 32; i++) { + out[ block ][i] = cond[i]; + } + block++; + } +} + +/* input: r, sequence of bits to be permuted */ +/* cond, control bits as array of 128-bit vectors */ +/* rev, 0 for normal application; !0 for inverse */ +/* output: r, permuted bits */ +void PQCLEAN_MCELIECE348864_AVX_benes(uint64_t *r, uint64_t cond[][32], int rev) { + int block, inc; + + uint64_t *bs = r; + + // + + if (rev == 0) { + block = 0; + inc = 1; + } else { + block = 22; + inc = -1; + } + + PQCLEAN_MCELIECE348864_AVX_transpose_64x64(bs); + + layer_0(bs, cond[ block ]); + block += inc; + layer_1(bs, cond[ block ]); + block += inc; + layer_2(bs, cond[ block ]); + block += inc; + layer_3(bs, cond[ block ]); + block += inc; + layer_4(bs, cond[ block ]); + block += inc; + layer_5(bs, cond[ block ]); + block += inc; + + PQCLEAN_MCELIECE348864_AVX_transpose_64x64(bs); + + layer_0(bs, cond[ block ]); + block += inc; + layer_1(bs, cond[ block ]); + block += inc; + layer_2(bs, cond[ block ]); + block += inc; + layer_3(bs, cond[ block ]); + block += inc; + layer_4(bs, cond[ block ]); + block += inc; + layer_5(bs, cond[ block ]); + block += inc; + layer_4(bs, cond[ block ]); + block += inc; + layer_3(bs, cond[ block ]); + block += inc; + layer_2(bs, cond[ block ]); + block += inc; + layer_1(bs, cond[ block ]); + block += inc; + layer_0(bs, cond[ block ]); + block += inc; + + PQCLEAN_MCELIECE348864_AVX_transpose_64x64(bs); + + layer_5(bs, cond[ block ]); + block += inc; + layer_4(bs, cond[ block ]); + block += inc; + layer_3(bs, cond[ block ]); + block += inc; + layer_2(bs, cond[ block ]); + block += inc; + layer_1(bs, cond[ block ]); + block += inc; + layer_0(bs, cond[ block ]); + //block += inc; + + PQCLEAN_MCELIECE348864_AVX_transpose_64x64(bs); +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_avx/benes.h b/src/kem/classic_mceliece/pqclean_mceliece348864_avx/benes.h new file mode 100644 index 0000000000..a417328e39 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_avx/benes.h @@ -0,0 +1,15 @@ +#ifndef PQCLEAN_MCELIECE348864_AVX_BENES_H +#define PQCLEAN_MCELIECE348864_AVX_BENES_H +/* + This file is for Benes network related functions +*/ + + +#include "gf.h" +#include "vec128.h" + +void PQCLEAN_MCELIECE348864_AVX_load_bits(uint64_t /*out*/[][32], const unsigned char * /*bits*/); +void PQCLEAN_MCELIECE348864_AVX_benes(uint64_t * /*r*/, uint64_t /*cond*/[][32], int /*rev*/); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_avx/bm.c b/src/kem/classic_mceliece/pqclean_mceliece348864_avx/bm.c new file mode 100644 index 0000000000..666a48c5c0 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_avx/bm.c @@ -0,0 +1,219 @@ +/* + This file is for the inversion-free Berlekamp-Massey algorithm + see 
https://ieeexplore.ieee.org/document/87857 +*/ + +#include "bm.h" + +#include "gf.h" +#include "util.h" +#include "vec.h" +#include "vec128.h" + +#include + +extern void PQCLEAN_MCELIECE348864_AVX_update_asm(void *, gf, int); +extern gf PQCLEAN_MCELIECE348864_AVX_vec_reduce_asm(uint64_t *); + +static inline uint64_t mask_nonzero(gf a) { + uint64_t ret = a; + + ret -= 1; + ret >>= 63; + ret -= 1; + + return ret; +} + +static inline uint64_t mask_leq(uint16_t a, uint16_t b) { + uint64_t a_tmp = a; + uint64_t b_tmp = b; + uint64_t ret = b_tmp - a_tmp; + + ret >>= 63; + ret -= 1; + + return ret; +} + +static inline void vec_cmov(uint64_t out[][2], uint64_t mask) { + int i; + + for (i = 0; i < GFBITS; i++) { + out[i][0] = (out[i][0] & ~mask) | (out[i][1] & mask); + } +} + +static inline void interleave(vec128 *in, int idx0, int idx1, vec128 *mask, int b) { + int s = 1 << b; + + vec128 x, y; + + x = PQCLEAN_MCELIECE348864_AVX_vec128_or(PQCLEAN_MCELIECE348864_AVX_vec128_and(in[idx0], mask[0]), + PQCLEAN_MCELIECE348864_AVX_vec128_sll_2x(PQCLEAN_MCELIECE348864_AVX_vec128_and(in[idx1], mask[0]), s)); + + y = PQCLEAN_MCELIECE348864_AVX_vec128_or(PQCLEAN_MCELIECE348864_AVX_vec128_srl_2x(PQCLEAN_MCELIECE348864_AVX_vec128_and(in[idx0], mask[1]), s), + PQCLEAN_MCELIECE348864_AVX_vec128_and(in[idx1], mask[1])); + + in[idx0] = x; + in[idx1] = y; +} + +/* input: in, field elements in bitsliced form */ +/* output: out, field elements in non-bitsliced form */ +static inline void get_coefs(gf *out, vec128 *in) { + int i, k; + + vec128 mask[4][2]; + vec128 buf[16]; + + for (i = 0; i < GFBITS; i++) { + buf[i] = in[i]; + } + for (i = GFBITS; i < 16; i++) { + buf[i] = PQCLEAN_MCELIECE348864_AVX_vec128_setzero(); + } + + mask[0][0] = PQCLEAN_MCELIECE348864_AVX_vec128_set1_16b(0x5555); + mask[0][1] = PQCLEAN_MCELIECE348864_AVX_vec128_set1_16b(0xAAAA); + mask[1][0] = PQCLEAN_MCELIECE348864_AVX_vec128_set1_16b(0x3333); + mask[1][1] = PQCLEAN_MCELIECE348864_AVX_vec128_set1_16b(0xCCCC); + mask[2][0] = PQCLEAN_MCELIECE348864_AVX_vec128_set1_16b(0x0F0F); + mask[2][1] = PQCLEAN_MCELIECE348864_AVX_vec128_set1_16b(0xF0F0); + mask[3][0] = PQCLEAN_MCELIECE348864_AVX_vec128_set1_16b(0x00FF); + mask[3][1] = PQCLEAN_MCELIECE348864_AVX_vec128_set1_16b(0xFF00); + + interleave(buf, 0, 8, mask[3], 3); + interleave(buf, 1, 9, mask[3], 3); + interleave(buf, 2, 10, mask[3], 3); + interleave(buf, 3, 11, mask[3], 3); + interleave(buf, 4, 12, mask[3], 3); + interleave(buf, 5, 13, mask[3], 3); + interleave(buf, 6, 14, mask[3], 3); + interleave(buf, 7, 15, mask[3], 3); + + interleave(buf, 0, 4, mask[2], 2); + interleave(buf, 1, 5, mask[2], 2); + interleave(buf, 2, 6, mask[2], 2); + interleave(buf, 3, 7, mask[2], 2); + interleave(buf, 8, 12, mask[2], 2); + interleave(buf, 9, 13, mask[2], 2); + interleave(buf, 10, 14, mask[2], 2); + interleave(buf, 11, 15, mask[2], 2); + + interleave(buf, 0, 2, mask[1], 1); + interleave(buf, 1, 3, mask[1], 1); + interleave(buf, 4, 6, mask[1], 1); + interleave(buf, 5, 7, mask[1], 1); + interleave(buf, 8, 10, mask[1], 1); + interleave(buf, 9, 11, mask[1], 1); + interleave(buf, 12, 14, mask[1], 1); + interleave(buf, 13, 15, mask[1], 1); + + interleave(buf, 0, 1, mask[0], 0); + interleave(buf, 2, 3, mask[0], 0); + interleave(buf, 4, 5, mask[0], 0); + interleave(buf, 6, 7, mask[0], 0); + interleave(buf, 8, 9, mask[0], 0); + interleave(buf, 10, 11, mask[0], 0); + interleave(buf, 12, 13, mask[0], 0); + interleave(buf, 14, 15, mask[0], 0); + + for (i = 0; i < 16; i++) { + for (k = 0; k < 4; k++) { + out[ (4 * 0 + k) * 
16 + i ] = (PQCLEAN_MCELIECE348864_AVX_vec128_extract(buf[i], 0) >> (k * 16)) & GFMASK; + out[ (4 * 1 + k) * 16 + i ] = (PQCLEAN_MCELIECE348864_AVX_vec128_extract(buf[i], 1) >> (k * 16)) & GFMASK; + } + } +} + +/* input: in, field elements in bitsliced form */ +/* output: out, field elements in non-bitsliced form */ +void PQCLEAN_MCELIECE348864_AVX_bm(uint64_t out[ GFBITS ], vec128 in[ GFBITS ]) { + uint16_t i; + uint16_t N, L; + + uint64_t prod[ GFBITS ]; + uint64_t in_tmp[ GFBITS ]; + + uint64_t db[ GFBITS ][ 2 ]; + uint64_t BC_tmp[ GFBITS ][ 2 ]; + uint64_t BC[ GFBITS ][ 2 ]; + + uint64_t mask, t; + + gf d, b, c0 = 1; + + gf coefs[SYS_T * 2]; + + // init + + BC[0][1] = 0; + BC[0][0] = 1; + BC[0][0] <<= 63; + + for (i = 1; i < GFBITS; i++) { + BC[i][0] = BC[i][1] = 0; + } + + b = 1; + L = 0; + + // + + get_coefs(coefs, in); + + for (i = 0; i < GFBITS; i++) { + in_tmp[i] = 0; + } + + for (N = 0; N < SYS_T * 2; N++) { + // computing d + + PQCLEAN_MCELIECE348864_AVX_vec_mul_sp(prod, in_tmp, &BC[0][0]); + + PQCLEAN_MCELIECE348864_AVX_update_asm(in_tmp, coefs[N], 8); + + d = PQCLEAN_MCELIECE348864_AVX_vec_reduce_asm(prod); + + t = PQCLEAN_MCELIECE348864_AVX_gf_mul2(c0, coefs[N], b); + + d ^= t & 0xFFFFFFFF; + + // 3 cases + + mask = mask_nonzero(d) & mask_leq(L * 2, N); + + for (i = 0; i < GFBITS; i++) { + db[i][0] = (d >> i) & 1; + db[i][0] = -db[i][0]; + db[i][1] = (b >> i) & 1; + db[i][1] = -db[i][1]; + } + + PQCLEAN_MCELIECE348864_AVX_vec128_mul((vec128 *) BC_tmp, (vec128 *) db, (vec128 *) BC); + + vec_cmov(BC, mask); + + PQCLEAN_MCELIECE348864_AVX_update_asm(BC, mask & c0, 16); + + for (i = 0; i < GFBITS; i++) { + BC[i][1] = BC_tmp[i][0] ^ BC_tmp[i][1]; + } + + c0 = t >> 32; + b = (d & mask) | (b & ~mask); + L = ((N + 1 - L) & mask) | (L & ~mask); + + } + + c0 = PQCLEAN_MCELIECE348864_AVX_gf_inv(c0); + + for (i = 0; i < GFBITS; i++) { + out[i] = (c0 >> i) & 1; + out[i] = -out[i]; + } + + PQCLEAN_MCELIECE348864_AVX_vec_mul_sp(out, out, &BC[0][0]); +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_avx/bm.h b/src/kem/classic_mceliece/pqclean_mceliece348864_avx/bm.h new file mode 100644 index 0000000000..dd45f43272 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_avx/bm.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE348864_AVX_BM_H +#define PQCLEAN_MCELIECE348864_AVX_BM_H +/* + This file is for the inversion-free Berlekamp-Massey algorithm + see https://ieeexplore.ieee.org/document/87857 +*/ + + +#include "vec128.h" + +void PQCLEAN_MCELIECE348864_AVX_bm(uint64_t * /*out*/, vec128 * /*in*/); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_avx/consts.S b/src/kem/classic_mceliece/pqclean_mceliece348864_avx/consts.S new file mode 100644 index 0000000000..385ad55a7a --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_avx/consts.S @@ -0,0 +1,33 @@ +.data + +# not supported on MacOS +#.section .rodata + +.globl PQCLEAN_MCELIECE348864_AVX_MASK0_0 +.globl PQCLEAN_MCELIECE348864_AVX_MASK0_1 +.globl PQCLEAN_MCELIECE348864_AVX_MASK1_0 +.globl PQCLEAN_MCELIECE348864_AVX_MASK1_1 +.globl PQCLEAN_MCELIECE348864_AVX_MASK2_0 +.globl PQCLEAN_MCELIECE348864_AVX_MASK2_1 +.globl PQCLEAN_MCELIECE348864_AVX_MASK3_0 +.globl PQCLEAN_MCELIECE348864_AVX_MASK3_1 +.globl PQCLEAN_MCELIECE348864_AVX_MASK4_0 +.globl PQCLEAN_MCELIECE348864_AVX_MASK4_1 +.globl PQCLEAN_MCELIECE348864_AVX_MASK5_0 +.globl PQCLEAN_MCELIECE348864_AVX_MASK5_1 + +.p2align 5 + +PQCLEAN_MCELIECE348864_AVX_MASK0_0: .quad 0x5555555555555555, 0x5555555555555555, 0x5555555555555555, 
0x5555555555555555 +PQCLEAN_MCELIECE348864_AVX_MASK0_1: .quad 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA +PQCLEAN_MCELIECE348864_AVX_MASK1_0: .quad 0x3333333333333333, 0x3333333333333333, 0x3333333333333333, 0x3333333333333333 +PQCLEAN_MCELIECE348864_AVX_MASK1_1: .quad 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC +PQCLEAN_MCELIECE348864_AVX_MASK2_0: .quad 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F +PQCLEAN_MCELIECE348864_AVX_MASK2_1: .quad 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0 +PQCLEAN_MCELIECE348864_AVX_MASK3_0: .quad 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF +PQCLEAN_MCELIECE348864_AVX_MASK3_1: .quad 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00 +PQCLEAN_MCELIECE348864_AVX_MASK4_0: .quad 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF +PQCLEAN_MCELIECE348864_AVX_MASK4_1: .quad 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000 +PQCLEAN_MCELIECE348864_AVX_MASK5_0: .quad 0x00000000FFFFFFFF, 0x00000000FFFFFFFF, 0x00000000FFFFFFFF, 0x00000000FFFFFFFF +PQCLEAN_MCELIECE348864_AVX_MASK5_1: .quad 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000 + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_avx/consts.inc b/src/kem/classic_mceliece/pqclean_mceliece348864_avx/consts.inc new file mode 100644 index 0000000000..73b3812249 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_avx/consts.inc @@ -0,0 +1,238 @@ +{ + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF00F0FF0F00F0FF0, 0xF00F0FF0F00F0FF0, 0xF00F0FF0F00F0FF0, 0xF00F0FF0F00F0FF0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F00F0F0F0FF0F0, 0xF0F00F0F0F0FF0F0, 0xF0F00F0F0F0FF0F0, 0xF0F00F0F0F0FF0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0FF00FF00FF00FF0, 0x0FF00FF00FF00FF0, 0x0FF00FF00FF00FF0, 0x0FF00FF00FF00FF0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAA5555AAAA5555AA, 0xAA5555AAAA5555AA, 0xAA5555AAAA5555AA, 0xAA5555AAAA5555AA), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF00F0FF0F00F0FF0, 0xF00F0FF0F00F0FF0, 0xF00F0FF0F00F0FF0, 0xF00F0FF0F00F0FF0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x33CCCC33CC3333CC, 0x33CCCC33CC3333CC, 0x33CCCC33CC3333CC, 0x33CCCC33CC3333CC), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xCC33CC3333CC33CC, 0xCC33CC3333CC33CC, 0xCC33CC3333CC33CC, 0xCC33CC3333CC33CC), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x33CC33CC33CC33CC, 0x33CC33CC33CC33CC, 0x33CC33CC33CC33CC, 0x33CC33CC33CC33CC), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x5A5A5A5A5A5A5A5A, 0x5A5A5A5A5A5A5A5A, 0x5A5A5A5A5A5A5A5A, 0x5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF00F0FF0F00F0FF0, 0xF00F0FF0F00F0FF0, 0xF00F0FF0F00F0FF0, 0xF00F0FF0F00F0FF0), +}, +{ + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x3C3C3C3C3C3C3C3C, 0x3C3C3C3C3C3C3C3C, 0x3C3C3C3C3C3C3C3C, 0x3C3C3C3C3C3C3C3C), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0x0F0F0F0F0F0F0F0F, 0xF0F0F0F0F0F0F0F0, 0x0F0F0F0F0F0F0F0F), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x5555AAAA5555AAAA, 0xAAAA5555AAAA5555, 0x5555AAAA5555AAAA, 0xAAAA5555AAAA5555), + 
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xCC3333CCCC3333CC, 0xCC3333CCCC3333CC, 0xCC3333CCCC3333CC, 0xCC3333CCCC3333CC), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xC33CC33CC33CC33C, 0xC33CC33CC33CC33C, 0xC33CC33CC33CC33C, 0xC33CC33CC33CC33C), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x55555555AAAAAAAA, 0x55555555AAAAAAAA, 0x55555555AAAAAAAA, 0x55555555AAAAAAAA), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x33333333CCCCCCCC, 0x33333333CCCCCCCC, 0x33333333CCCCCCCC, 0x33333333CCCCCCCC), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x00FF00FFFF00FF00, 0xFF00FF0000FF00FF, 0x00FF00FFFF00FF00, 0xFF00FF0000FF00FF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0x0F0F0F0F0F0F0F0F, 0xF0F0F0F0F0F0F0F0, 0x0F0F0F0F0F0F0F0F), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000FFFFFFFF0000, 0x0000FFFFFFFF0000, 0x0000FFFFFFFF0000, 0x0000FFFFFFFF0000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F00F0F0F0FF0F0, 0xF0F00F0F0F0FF0F0, 0xF0F00F0F0F0FF0F0, 0xF0F00F0F0F0FF0F0), +}, +{ + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAA55AA5555AA55AA, 0x55AA55AAAA55AA55, 0xAA55AA5555AA55AA, 0x55AA55AAAA55AA55), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xCC33CC3333CC33CC, 0xCC33CC3333CC33CC, 0x33CC33CCCC33CC33, 0x33CC33CCCC33CC33), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x33CCCC33CC3333CC, 0xCC3333CC33CCCC33, 0xCC3333CC33CCCC33, 0x33CCCC33CC3333CC), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x55555555AAAAAAAA, 0x55555555AAAAAAAA, 0x55555555AAAAAAAA, 0x55555555AAAAAAAA), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFF0000FF00FFFF00, 0xFF0000FF00FFFF00, 0x00FFFF00FF0000FF, 0x00FFFF00FF0000FF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x3CC33CC3C33CC33C, 0xC33CC33C3CC33CC3, 0x3CC33CC3C33CC33C, 0xC33CC33C3CC33CC3), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x5555AAAA5555AAAA, 0xAAAA5555AAAA5555, 0x5555AAAA5555AAAA, 0xAAAA5555AAAA5555), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0FF00FF00FF00FF0, 0xF00FF00FF00FF00F, 0x0FF00FF00FF00FF0, 0xF00FF00FF00FF00F), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xCCCC33333333CCCC, 0x3333CCCCCCCC3333, 0x3333CCCCCCCC3333, 0xCCCC33333333CCCC), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0x0F0F0F0F0F0F0F0F, 0xF0F0F0F0F0F0F0F0, 0x0F0F0F0F0F0F0F0F), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x00FFFF0000FFFF00, 0xFF0000FFFF0000FF, 0x00FFFF0000FFFF00, 0xFF0000FFFF0000FF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xC33CC33CC33CC33C, 0xC33CC33CC33CC33C, 0xC33CC33CC33CC33C, 0xC33CC33CC33CC33C), +}, +{ + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x6699669999669966, 0x9966996666996699, 0x6699669999669966, 0x9966996666996699), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x33CCCC33CC3333CC, 0x33CCCC33CC3333CC, 0x33CCCC33CC3333CC, 0x33CCCC33CC3333CC), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xA5A5A5A55A5A5A5A, 0xA5A5A5A55A5A5A5A, 0x5A5A5A5AA5A5A5A5, 0x5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x3C3CC3C3C3C33C3C, 0x3C3CC3C3C3C33C3C, 0xC3C33C3C3C3CC3C3, 0xC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF00FF00F0FF00FF0, 0x0FF00FF0F00FF00F, 0x0FF00FF0F00FF00F, 0xF00FF00F0FF00FF0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x3C3CC3C3C3C33C3C, 0x3C3CC3C3C3C33C3C, 0xC3C33C3C3C3CC3C3, 0xC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0xF0F0F0F00F0F0F0F, 
0x0F0F0F0FF0F0F0F0, 0xF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x33CCCC33CC3333CC, 0xCC3333CC33CCCC33, 0x33CCCC33CC3333CC, 0xCC3333CC33CCCC33), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0x0F0F0F0F0F0F0F0F, 0xF0F0F0F0F0F0F0F0, 0x0F0F0F0F0F0F0F0F), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A), +}, +{ + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x6699669999669966, 0x9966996666996699, 0x6699669999669966, 0x9966996666996699), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xCC3333CC33CCCC33, 0xCC3333CC33CCCC33, 0xCC3333CC33CCCC33, 0xCC3333CC33CCCC33), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x5A5A5A5AA5A5A5A5, 0x5A5A5A5AA5A5A5A5, 0xA5A5A5A55A5A5A5A, 0xA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x3C3CC3C3C3C33C3C, 0x3C3CC3C3C3C33C3C, 0xC3C33C3C3C3CC3C3, 0xC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0FF00FF0F00FF00F, 0xF00FF00F0FF00FF0, 0xF00FF00F0FF00FF0, 0x0FF00FF0F00FF00F), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x3C3CC3C3C3C33C3C, 0x3C3CC3C3C3C33C3C, 0xC3C33C3C3C3CC3C3, 0xC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0xF0F0F0F00F0F0F0F, 0x0F0F0F0FF0F0F0F0, 0xF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x33CCCC33CC3333CC, 0xCC3333CC33CCCC33, 0x33CCCC33CC3333CC, 0xCC3333CC33CCCC33), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0x0F0F0F0F0F0F0F0F, 0xF0F0F0F0F0F0F0F0, 0x0F0F0F0F0F0F0F0F), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A), +}, +{ + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x9669699696696996, 0x9669699696696996, 0x9669699696696996, 0x9669699696696996), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x6996699669966996, 0x6996699669966996, 0x6996699669966996, 0x6996699669966996), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x6996699669966996, 0x6996699669966996, 0x6996699669966996, 0x6996699669966996), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x00FFFF0000FFFF00, 0x00FFFF0000FFFF00, 0xFF0000FFFF0000FF, 0xFF0000FFFF0000FF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF00FF00F0FF00FF0, 0x0FF00FF0F00FF00F, 0x0FF00FF0F00FF00F, 0xF00FF00F0FF00FF0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F00F0F0F0FF0F0, 0x0F0FF0F0F0F00F0F, 0x0F0FF0F0F0F00F0F, 0xF0F00F0F0F0FF0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3, 0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3, 0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xC33C3CC33CC3C33C, 0xC33C3CC33CC3C33C, 0xC33C3CC33CC3C33C, 0xC33C3CC33CC3C33C), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x3CC3C33C3CC3C33C, 0x3CC3C33C3CC3C33C, 0x3CC3C33C3CC3C33C, 0x3CC3C33C3CC3C33C), +}, +{ + 
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x9669699696696996, 0x9669699696696996, 0x9669699696696996, 0x9669699696696996), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x6996699669966996, 0x6996699669966996, 0x6996699669966996, 0x6996699669966996), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x9669966996699669, 0x9669966996699669, 0x9669966996699669, 0x9669966996699669), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFF0000FFFF0000FF, 0xFF0000FFFF0000FF, 0x00FFFF0000FFFF00, 0x00FFFF0000FFFF00), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x00FF00FF00FF00FF, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0x00FF00FF00FF00FF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF00FF00F0FF00FF0, 0x0FF00FF0F00FF00F, 0x0FF00FF0F00FF00F, 0xF00FF00F0FF00FF0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F00F0F0F0FF0F0, 0x0F0FF0F0F0F00F0F, 0x0F0FF0F0F0F00F0F, 0xF0F00F0F0F0FF0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3, 0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3, 0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xC33C3CC33CC3C33C, 0xC33C3CC33CC3C33C, 0xC33C3CC33CC3C33C, 0xC33C3CC33CC3C33C), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x3CC3C33C3CC3C33C, 0x3CC3C33C3CC3C33C, 0x3CC3C33C3CC3C33C, 0x3CC3C33C3CC3C33C), +}, +{ + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x9669699696696996, 0x9669699696696996, 0x9669699696696996, 0x9669699696696996), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x9669966996699669, 0x9669966996699669, 0x9669966996699669, 0x9669966996699669), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x9669966996699669, 0x9669966996699669, 0x9669966996699669, 0x9669966996699669), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x00FFFF0000FFFF00, 0x00FFFF0000FFFF00, 0xFF0000FFFF0000FF, 0xFF0000FFFF0000FF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF00FF00F0FF00FF0, 0x0FF00FF0F00FF00F, 0x0FF00FF0F00FF00F, 0xF00FF00F0FF00FF0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F00F0F0F0FF0F0, 0x0F0FF0F0F0F00F0F, 0x0F0FF0F0F0F00F0F, 0xF0F00F0F0F0FF0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3, 0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3, 0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xC33C3CC33CC3C33C, 0xC33C3CC33CC3C33C, 0xC33C3CC33CC3C33C, 0xC33C3CC33CC3C33C), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x3CC3C33C3CC3C33C, 0x3CC3C33C3CC3C33C, 0x3CC3C33C3CC3C33C, 0x3CC3C33C3CC3C33C), +}, +{ + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x9669699696696996, 0x9669699696696996, 0x9669699696696996, 0x9669699696696996), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x9669966996699669, 0x9669966996699669, 0x9669966996699669, 0x9669966996699669), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x6996699669966996, 0x6996699669966996, 0x6996699669966996, 0x6996699669966996), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFF0000FFFF0000FF, 0xFF0000FFFF0000FF, 0x00FFFF0000FFFF00, 0x00FFFF0000FFFF00), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x00FF00FF00FF00FF, 0xFF00FF00FF00FF00, 
0xFF00FF00FF00FF00, 0x00FF00FF00FF00FF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF00FF00F0FF00FF0, 0x0FF00FF0F00FF00F, 0x0FF00FF0F00FF00F, 0xF00FF00F0FF00FF0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F00F0F0F0FF0F0, 0x0F0FF0F0F0F00F0F, 0x0F0FF0F0F0F00F0F, 0xF0F00F0F0F0FF0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3, 0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3, 0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xC33C3CC33CC3C33C, 0xC33C3CC33CC3C33C, 0xC33C3CC33CC3C33C, 0xC33C3CC33CC3C33C), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x3CC3C33C3CC3C33C, 0x3CC3C33C3CC3C33C, 0x3CC3C33C3CC3C33C, 0x3CC3C33C3CC3C33C), +}, +{ + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + 
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 
0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF), + 
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_avx/controlbits.c b/src/kem/classic_mceliece/pqclean_mceliece348864_avx/controlbits.c new file mode 100644 index 0000000000..d9cfce1437 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_avx/controlbits.c @@ -0,0 +1,274 @@ +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. 
a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + +#include "controlbits.h" + +#include "params.h" + +#include <stdint.h> + +typedef uint8_t bit; + +#define N (1 << GFBITS) + +static bit is_smaller(uint32_t a, uint32_t b) { + uint32_t ret = 0; + + ret = a - b; + ret >>= 31; + + return (bit)ret; +} + +static bit is_smaller_63b(uint64_t a, uint64_t b) { + uint64_t ret = 0; + + ret = a - b; + ret >>= 63; + + return (bit)ret; +} + +static void cswap(uint32_t *x, uint32_t *y, bit swap) { + uint32_t m; + uint32_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +static void cswap_63b(uint64_t *x, uint64_t *y, bit swap) { + uint64_t m; + uint64_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +/* output x = min(input x,input y) */ +/* output y = max(input x,input y) */ + +static void minmax(uint32_t *x, uint32_t *y) { + bit m; + + m = is_smaller(*y, *x); + cswap(x, y, m); +} + +static void minmax_63b(uint64_t *x, uint64_t *y) { + bit m; + + m = is_smaller_63b(*y, *x); + cswap_63b(x, y, m); +} + +/* merge first half of x[0],x[step],...,x[(2*n-1)*step] with second half */ +/* requires n to be a power of 2 */ + +static void merge(int n, uint32_t *x, int step) { + int i; + if (n == 1) { + minmax(&x[0], &x[step]); + } else { + merge(n / 2, x, step * 2); + merge(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax(&x[i * step], &x[(i + 1) * step]); + } + } +} + +static void merge_63b(int n, uint64_t *x, int step) { + int i; + if (n == 1) { + minmax_63b(&x[0], &x[step]); + } else { + merge_63b(n / 2, x, step * 2); + merge_63b(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax_63b(&x[i * step], &x[(i + 1) * step]); + } + } +} + +/* sort x[0],x[1],...,x[n-1] in place */ +/* requires n to be a power of 2 */ + +static void sort(int n, uint32_t *x) { + if (n <= 1) { + return; + } + sort(n / 2, x); + sort(n / 2, x + n / 2); + merge(n / 2, x, 1); +} + +void PQCLEAN_MCELIECE348864_AVX_sort_63b(int n, uint64_t *x) { + if (n <= 1) { + return; + } + PQCLEAN_MCELIECE348864_AVX_sort_63b(n / 2, x); + PQCLEAN_MCELIECE348864_AVX_sort_63b(n / 2, x + n / 2); + merge_63b(n / 2, x, 1); +} + +/* y[pi[i]] = x[i] */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void composeinv(int n, uint32_t *y, const uint32_t *x, const uint32_t *pi) { // NC + int i; + uint32_t t[2 * N]; + + for (i = 0; i < n; ++i) { + t[i] = x[i] | (pi[i] << 16); + } + + sort(n, t); + + for (i = 0; i < n; ++i) { + y[i] = t[i] & 0xFFFF; + } +} + +/* ip[i] = j iff pi[i] = j */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void invert(int n, uint32_t *ip, const uint32_t *pi) { + int i; + + for (i = 0; i < n; i++) { + ip[i] = i; + } + + composeinv(n, ip, ip, pi); +} + + +static void flow(int w, uint32_t *x, const uint32_t *y, int t) { + bit m0; + bit m1; + + uint32_t b; + uint32_t y_copy = *y; + + m0 = is_smaller(*y & ((1 << w) - 1), *x & ((1 << w) - 1)); + m1 = is_smaller(0, t); + + cswap(x, &y_copy, m0); + b = m0 & m1; + *x ^= b << w; +} + +/* input: permutation pi */ +/* output: (2w-1)n/2 (or 0 if n==1) control bits c[0],c[step],c[2*step],...
*/ +/* requires n = 2^w */ +static void controlbitsfrompermutation(int w, int n, int step, int off, unsigned char *c, const uint32_t *pi) { + int i; + int j; + int k; + int t; + uint32_t ip[N] = {0}; + uint32_t I[2 * N] = {0}; + uint32_t P[2 * N] = {0}; + uint32_t PI[2 * N] = {0}; + uint32_t T[2 * N] = {0}; + uint32_t piflip[N] = {0}; + uint32_t subpi[2][N / 2] = {{0}}; + + if (w == 1) { + c[ off / 8 ] |= (pi[0] & 1) << (off % 8); + } + if (w <= 1) { + return; + } + + invert(n, ip, pi); + + for (i = 0; i < n; ++i) { + I[i] = ip[i] | (1 << w); + I[n + i] = pi[i]; + } + + for (i = 0; i < 2 * n; ++i) { + P[i] = (i >> w) + (i & ((1 << w) - 2)) + ((i & 1) << w); + } + + for (t = 0; t < w; ++t) { + composeinv(2 * n, PI, P, I); + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &PI[i], t); + } + + for (i = 0; i < 2 * n; ++i) { + T[i] = I[i ^ 1]; + } + + composeinv(2 * n, I, I, T); + + for (i = 0; i < 2 * n; ++i) { + T[i] = P[i ^ 1]; + } + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &T[i], 1); + } + } + + for (i = 0; i < n; ++i) { + for (j = 0; j < w; ++j) { + piflip[i] = pi[i]; + } + } + + for (i = 0; i < n / 2; ++i) { + c[ (off + i * step) / 8 ] |= ((P[i * 2] >> w) & 1) << ((off + i * step) % 8); + } + for (i = 0; i < n / 2; ++i) { + c[ (off + ((w - 1)*n + i) * step) / 8 ] |= ((P[n + i * 2] >> w) & 1) << ((off + ((w - 1) * n + i) * step) % 8); + } + + for (i = 0; i < n / 2; ++i) { + cswap(&piflip[i * 2], &piflip[i * 2 + 1], (P[n + i * 2] >> w) & 1); + } + + for (k = 0; k < 2; ++k) { + for (i = 0; i < n / 2; ++i) { + subpi[k][i] = piflip[i * 2 + k] >> 1; + } + } + + for (k = 0; k < 2; ++k) { + controlbitsfrompermutation(w - 1, n / 2, step * 2, off + step * (n / 2 + k), c, subpi[k]); + } +} + +/* input: pi, a permutation*/ +/* output: out, control bits w.r.t. pi */ +void PQCLEAN_MCELIECE348864_AVX_controlbits(unsigned char *out, const uint32_t *pi) { + unsigned int i; + unsigned char c[ (2 * GFBITS - 1) * (1 << GFBITS) / 16 ]; + + for (i = 0; i < sizeof(c); i++) { + c[i] = 0; + } + + controlbitsfrompermutation(GFBITS, (1 << GFBITS), 1, 0, c, pi); + + for (i = 0; i < sizeof(c); i++) { + out[i] = c[i]; + } +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_avx/controlbits.h b/src/kem/classic_mceliece/pqclean_mceliece348864_avx/controlbits.h new file mode 100644 index 0000000000..2574c5f6fd --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_avx/controlbits.h @@ -0,0 +1,15 @@ +#ifndef PQCLEAN_MCELIECE348864_AVX_CONTROLBITS_H +#define PQCLEAN_MCELIECE348864_AVX_CONTROLBITS_H +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. 
a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + + +#include <stdint.h> + +void PQCLEAN_MCELIECE348864_AVX_sort_63b(int n, uint64_t *x); +void PQCLEAN_MCELIECE348864_AVX_controlbits(unsigned char *out, const uint32_t *pi); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_avx/crypto_hash.h b/src/kem/classic_mceliece/pqclean_mceliece348864_avx/crypto_hash.h new file mode 100644 index 0000000000..d3ca396a19 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_avx/crypto_hash.h @@ -0,0 +1,7 @@ +#ifndef PQCLEAN_MCELIECE348864_AVX_CRYPTO_HASH_H +#define PQCLEAN_MCELIECE348864_AVX_CRYPTO_HASH_H +#include "fips202.h" + +#define crypto_hash_32b(out,in,inlen) shake256(out, 32, in, inlen) + +#endif diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_avx/decrypt.c b/src/kem/classic_mceliece/pqclean_mceliece348864_avx/decrypt.c new file mode 100644 index 0000000000..c8c59d417a --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_avx/decrypt.c @@ -0,0 +1,234 @@ +/* + This file is for Niederreiter decryption +*/ + +#include "decrypt.h" + +#include "benes.h" +#include "bm.h" +#include "fft.h" +#include "fft_tr.h" +#include "params.h" +#include "util.h" + +#include <stdint.h> + +static void scaling(vec256 out[][GFBITS], vec256 inv[][GFBITS], const unsigned char *sk, vec256 *recv) { + int i, j; + + uint64_t sk_int[ GFBITS ]; + vec256 eval[16][ GFBITS ]; + vec256 tmp[ GFBITS ]; + + // computing inverses + + PQCLEAN_MCELIECE348864_AVX_irr_load(sk_int, sk); + + PQCLEAN_MCELIECE348864_AVX_fft(eval, sk_int); + + for (i = 0; i < 16; i++) { + PQCLEAN_MCELIECE348864_AVX_vec256_sq(eval[i], eval[i]); + } + + PQCLEAN_MCELIECE348864_AVX_vec256_copy(inv[0], eval[0]); + + for (i = 1; i < 16; i++) { + PQCLEAN_MCELIECE348864_AVX_vec256_mul(inv[i], inv[i - 1], eval[i]); + } + + PQCLEAN_MCELIECE348864_AVX_vec256_inv(tmp, inv[15]); + + for (i = 14; i >= 0; i--) { + PQCLEAN_MCELIECE348864_AVX_vec256_mul(inv[i + 1], tmp, inv[i]); + PQCLEAN_MCELIECE348864_AVX_vec256_mul(tmp, tmp, eval[i + 1]); + } + + PQCLEAN_MCELIECE348864_AVX_vec256_copy(inv[0], tmp); + + // + + for (i = 0; i < 16; i++) { + for (j = 0; j < GFBITS; j++) { + out[i][j] = PQCLEAN_MCELIECE348864_AVX_vec256_and(inv[i][j], recv[i]); + } + } +} + +static void preprocess(vec128 *recv, const unsigned char *s) { + int i; + unsigned char r[ 512 ]; + + for (i = 0; i < SYND_BYTES; i++) { + r[i] = s[i]; + } + + for (i = SYND_BYTES; i < 512; i++) { + r[i] = 0; + } + + for (i = 0; i < 32; i++) { + recv[i] = PQCLEAN_MCELIECE348864_AVX_load16(r + i * 16); + } +} + +static void postprocess(unsigned char *e, vec128 *err) { + int i; + unsigned char error8[ (1 << GFBITS) / 8 ]; + uint64_t v[2]; + + for (i = 0; i < 32; i++) { + v[0] = PQCLEAN_MCELIECE348864_AVX_vec128_extract(err[i], 0); + v[1] = PQCLEAN_MCELIECE348864_AVX_vec128_extract(err[i], 1); + + PQCLEAN_MCELIECE348864_AVX_store8(error8 + i * 16 + 0, v[0]); + PQCLEAN_MCELIECE348864_AVX_store8(error8 + i * 16 + 8, v[1]); + } + + for (i = 0; i < SYS_N / 8; i++) { + e[i] = error8[i]; + } +} + +static void scaling_inv(vec256 out[][GFBITS], vec256 inv[][GFBITS], vec256 *recv) { + int i, j; + + for (i = 0; i < 16; i++) { + for (j = 0; j < GFBITS; j++) { + out[i][j] = PQCLEAN_MCELIECE348864_AVX_vec256_and(inv[i][j], recv[i]); + } + } +} + +static uint16_t weight_check(unsigned char *e, vec128 *error) { + int i; + uint16_t w0 = 0; + uint16_t w1 = 0; + uint16_t check; + + for (i = 0; i < 32; i++) { + w0 +=
_mm_popcnt_u64( PQCLEAN_MCELIECE348864_AVX_vec128_extract(error[i], 0) ); + w0 += _mm_popcnt_u64( PQCLEAN_MCELIECE348864_AVX_vec128_extract(error[i], 1) ); + } + + for (i = 0; i < SYS_N / 8; i++) { + w1 += _mm_popcnt_u64( e[i] ); + } + + check = (w0 ^ SYS_T) | (w1 ^ SYS_T); + check -= 1; + check >>= 15; + + return check; +} + +static uint16_t synd_cmp(vec128 *s0, vec128 *s1) { + int i; + vec128 diff; + + diff = PQCLEAN_MCELIECE348864_AVX_vec128_xor(s0[0], s1[0]); + + for (i = 1; i < GFBITS; i++) { + diff = PQCLEAN_MCELIECE348864_AVX_vec128_or(diff, PQCLEAN_MCELIECE348864_AVX_vec128_xor(s0[i], s1[i])); + } + + return (uint16_t)PQCLEAN_MCELIECE348864_AVX_vec128_testz(diff); +} + +static void reformat_128to256(vec256 *out, vec128 *in) { + int i; + uint64_t v[4]; + + for (i = 0; i < 16; i++) { + v[0] = PQCLEAN_MCELIECE348864_AVX_vec128_extract(in[2 * i + 0], 0); + v[1] = PQCLEAN_MCELIECE348864_AVX_vec128_extract(in[2 * i + 0], 1); + v[2] = PQCLEAN_MCELIECE348864_AVX_vec128_extract(in[2 * i + 1], 0); + v[3] = PQCLEAN_MCELIECE348864_AVX_vec128_extract(in[2 * i + 1], 1); + + out[i] = PQCLEAN_MCELIECE348864_AVX_vec256_set4x(v[0], v[1], v[2], v[3]); + } +} + +static void reformat_256to128(vec128 *out, vec256 *in) { + int i; + uint64_t v[4]; + + for (i = 0; i < 16; i++) { + v[0] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[i], 0); + v[1] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[i], 1); + v[2] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[i], 2); + v[3] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[i], 3); + + out[2 * i + 0] = PQCLEAN_MCELIECE348864_AVX_vec128_set2x(v[0], v[1]); + out[2 * i + 1] = PQCLEAN_MCELIECE348864_AVX_vec128_set2x(v[2], v[3]); + } +} + +/* Niederreiter decryption with the Berlekamp decoder */ +/* intput: sk, secret key */ +/* c, ciphertext (syndrome) */ +/* output: e, error vector */ +/* return: 0 for success; 1 for failure */ +int PQCLEAN_MCELIECE348864_AVX_decrypt(unsigned char *e, const unsigned char *sk, const unsigned char *c) { + int i; + + uint16_t check_synd; + uint16_t check_weight; + + vec256 inv[ 16 ][ GFBITS ]; + vec256 scaled[ 16 ][ GFBITS ]; + vec256 eval[16][ GFBITS ]; + + vec128 error128[ 32 ]; + vec256 error256[ 16 ]; + + vec128 s_priv[ GFBITS ]; + vec128 s_priv_cmp[ GFBITS ]; + uint64_t locator[ GFBITS ]; + + vec128 recv128[ 32 ]; + vec256 recv256[ 16 ]; + vec256 allone; + + uint64_t bits_int[23][32]; + + // Berlekamp decoder + + preprocess(recv128, c); + + PQCLEAN_MCELIECE348864_AVX_load_bits(bits_int, sk + IRR_BYTES); + PQCLEAN_MCELIECE348864_AVX_benes((uint64_t *) recv128, bits_int, 1); + + reformat_128to256(recv256, recv128); + + scaling(scaled, inv, sk, recv256); + PQCLEAN_MCELIECE348864_AVX_fft_tr(s_priv, scaled); + PQCLEAN_MCELIECE348864_AVX_bm(locator, s_priv); + + PQCLEAN_MCELIECE348864_AVX_fft(eval, locator); + + // reencryption and weight check + + allone = PQCLEAN_MCELIECE348864_AVX_vec256_set1_16b(0xFFFF); + + for (i = 0; i < 16; i++) { + error256[i] = PQCLEAN_MCELIECE348864_AVX_vec256_or_reduce(eval[i]); + error256[i] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(error256[i], allone); + } + + scaling_inv(scaled, inv, error256); + PQCLEAN_MCELIECE348864_AVX_fft_tr(s_priv_cmp, scaled); + + check_synd = synd_cmp(s_priv, s_priv_cmp); + + // + + reformat_256to128(error128, error256); + PQCLEAN_MCELIECE348864_AVX_benes((uint64_t *) error128, bits_int, 0); + + postprocess(e, error128); + + check_weight = weight_check(e, error128); + + return 1 - (check_synd & check_weight); +} + diff --git 
a/src/kem/classic_mceliece/pqclean_mceliece348864_avx/decrypt.h b/src/kem/classic_mceliece/pqclean_mceliece348864_avx/decrypt.h new file mode 100644 index 0000000000..3b479a5324 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_avx/decrypt.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_MCELIECE348864_AVX_DECRYPT_H +#define PQCLEAN_MCELIECE348864_AVX_DECRYPT_H +/* + This file is for Nieddereiter decryption +*/ + +int PQCLEAN_MCELIECE348864_AVX_decrypt(unsigned char * /*e*/, const unsigned char * /*sk*/, const unsigned char * /*c*/); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_avx/encrypt.c b/src/kem/classic_mceliece/pqclean_mceliece348864_avx/encrypt.c new file mode 100644 index 0000000000..800c6d7310 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_avx/encrypt.c @@ -0,0 +1,99 @@ +/* + This file is for Niederreiter encryption +*/ + +#include "encrypt.h" + +#include "gf.h" +#include "int32_sort.h" +#include "params.h" +#include "randombytes.h" +#include "util.h" + +#include +#include +#include + +/* input: public key pk, error vector e */ +/* output: syndrome s */ +extern void PQCLEAN_MCELIECE348864_AVX_syndrome_asm(unsigned char *s, const unsigned char *pk, unsigned char *e); + +/* output: e, an error vector of weight t */ +static void gen_e(unsigned char *e) { + int i, j, eq, count; + + uint16_t ind[ SYS_T * 2 ]; + int32_t ind32[ SYS_T * 2 ]; + uint64_t e_int[ (SYS_N + 63) / 64 ]; + uint64_t one = 1; + uint64_t mask; + uint64_t val[ SYS_T ]; + + while (1) { + randombytes((uint8_t *) ind, sizeof(ind)); + + for (i = 0; i < SYS_T * 2; i++) { + ind[i] &= GFMASK; + } + + // + + count = 0; + for (i = 0; i < SYS_T * 2; i++) { + if (ind[i] < SYS_N) { + ind32[ count++ ] = ind[i]; + } + } + + if (count < SYS_T) { + continue; + } + + // check for repetition + + PQCLEAN_MCELIECE348864_AVX_int32_sort(ind32, SYS_T); + + eq = 0; + for (i = 1; i < SYS_T; i++) { + if (ind32[i - 1] == ind32[i]) { + eq = 1; + } + } + + if (eq == 0) { + break; + } + } + + for (j = 0; j < SYS_T; j++) { + val[j] = one << (ind32[j] & 63); + } + + for (i = 0; i < (SYS_N + 63) / 64; i++) { + e_int[i] = 0; + + for (j = 0; j < SYS_T; j++) { + mask = i ^ (ind32[j] >> 6); + mask -= 1; + mask >>= 63; + mask = -mask; + + e_int[i] |= val[j] & mask; + } + } + + for (i = 0; i < (SYS_N + 63) / 64 - 1; i++) { + PQCLEAN_MCELIECE348864_AVX_store8(e, e_int[i]); + e += 8; + } + + for (j = 0; j < (SYS_N % 64); j += 8) { + e[ j / 8 ] = (e_int[i] >> j) & 0xFF; + } +} + +void PQCLEAN_MCELIECE348864_AVX_encrypt(unsigned char *s, unsigned char *e, const unsigned char *pk) { + gen_e(e); + PQCLEAN_MCELIECE348864_AVX_syndrome_asm(s, pk, e); +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_avx/encrypt.h b/src/kem/classic_mceliece/pqclean_mceliece348864_avx/encrypt.h new file mode 100644 index 0000000000..0f54a31667 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_avx/encrypt.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_MCELIECE348864_AVX_ENCRYPT_H +#define PQCLEAN_MCELIECE348864_AVX_ENCRYPT_H +/* + This file is for Niederreiter encryption +*/ + + +void PQCLEAN_MCELIECE348864_AVX_encrypt(unsigned char * /*s*/, unsigned char * /*e*/, const unsigned char * /*pk*/); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_avx/fft.c b/src/kem/classic_mceliece/pqclean_mceliece348864_avx/fft.c new file mode 100644 index 0000000000..b884595629 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_avx/fft.c @@ -0,0 +1,172 @@ +/* + This file is for the 
Gao-Mateer FFT + sse http://www.math.clemson.edu/~sgao/papers/GM10.pdf +*/ + +#include "fft.h" + +#include "vec.h" + +/* input: in, polynomial in bitsliced form */ +/* output: in, result of applying the radix conversions on in */ +static void radix_conversions(uint64_t *in) { + int i, j, k; + + const uint64_t mask[5][2] = { + {0x8888888888888888, 0x4444444444444444}, + {0xC0C0C0C0C0C0C0C0, 0x3030303030303030}, + {0xF000F000F000F000, 0x0F000F000F000F00}, + {0xFF000000FF000000, 0x00FF000000FF0000}, + {0xFFFF000000000000, 0x0000FFFF00000000} + }; + + const uint64_t s[5][GFBITS] = { +#include "scalars.inc" + }; + + // + + for (j = 0; j <= 4; j++) { + for (i = 0; i < GFBITS; i++) { + for (k = 4; k >= j; k--) { + in[i] ^= (in[i] & mask[k][0]) >> (1 << k); + in[i] ^= (in[i] & mask[k][1]) >> (1 << k); + } + } + + PQCLEAN_MCELIECE348864_AVX_vec_mul(in, in, s[j]); // scaling + } +} + +/* input: in, result of applying the radix conversions to the input polynomial */ +/* output: out, evaluation results (by applying the FFT butterflies) */ +static void butterflies(vec256 out[][ GFBITS ], const uint64_t *in) { + int i, j, k, s, b; + + uint64_t t0, t1, t2, t3; + + const vec256 consts[ 17 ][ GFBITS ] = { +#include "consts.inc" + }; + + uint64_t consts_ptr = 0; + + const unsigned char reversal[64] = { + 0, 32, 16, 48, 8, 40, 24, 56, + 4, 36, 20, 52, 12, 44, 28, 60, + 2, 34, 18, 50, 10, 42, 26, 58, + 6, 38, 22, 54, 14, 46, 30, 62, + 1, 33, 17, 49, 9, 41, 25, 57, + 5, 37, 21, 53, 13, 45, 29, 61, + 3, 35, 19, 51, 11, 43, 27, 59, + 7, 39, 23, 55, 15, 47, 31, 63 + }; + + // boradcast + + vec256 tmp256[ GFBITS ]; + vec256 x[ GFBITS ], y[ GFBITS ]; + + for (j = 0; j < 64; j += 8) { + for (i = 0; i < GFBITS; i++) { + t0 = (in[i] >> reversal[j + 0]) & 1; + t0 = -t0; + t1 = (in[i] >> reversal[j + 2]) & 1; + t1 = -t1; + t2 = (in[i] >> reversal[j + 4]) & 1; + t2 = -t2; + t3 = (in[i] >> reversal[j + 6]) & 1; + t3 = -t3; + + out[j / 4 + 0][i] = PQCLEAN_MCELIECE348864_AVX_vec256_set4x(t0, t1, t2, t3); + + t0 = (in[i] >> reversal[j + 1]) & 1; + t0 = -t0; + t1 = (in[i] >> reversal[j + 3]) & 1; + t1 = -t1; + t2 = (in[i] >> reversal[j + 5]) & 1; + t2 = -t2; + t3 = (in[i] >> reversal[j + 7]) & 1; + t3 = -t3; + + out[j / 4 + 1][i] = PQCLEAN_MCELIECE348864_AVX_vec256_set4x(t0, t1, t2, t3); + } + } + + // + + for (i = 0; i < 16; i += 2) { + PQCLEAN_MCELIECE348864_AVX_vec256_mul(tmp256, out[i + 1], consts[ 0 ]); + + for (b = 0; b < GFBITS; b++) { + out[i + 0][b] ^= tmp256[b]; + } + for (b = 0; b < GFBITS; b++) { + out[i + 1][b] ^= out[i + 0][b]; + } + } + + for (i = 0; i < 16; i += 2) { + for (b = 0; b < GFBITS; b++) { + x[b] = PQCLEAN_MCELIECE348864_AVX_vec256_unpack_low_2x(out[i + 0][b], out[i + 1][b]); + } + for (b = 0; b < GFBITS; b++) { + y[b] = PQCLEAN_MCELIECE348864_AVX_vec256_unpack_high_2x(out[i + 0][b], out[i + 1][b]); + } + + PQCLEAN_MCELIECE348864_AVX_vec256_mul(tmp256, y, consts[ 1 ]); + + for (b = 0; b < GFBITS; b++) { + x[b] ^= tmp256[b]; + } + for (b = 0; b < GFBITS; b++) { + y[b] ^= x[b]; + } + + for (b = 0; b < GFBITS; b++) { + out[i + 0][b] = PQCLEAN_MCELIECE348864_AVX_vec256_unpack_low(x[b], y[b]); + } + for (b = 0; b < GFBITS; b++) { + out[i + 1][b] = PQCLEAN_MCELIECE348864_AVX_vec256_unpack_high(x[b], y[b]); + } + } + + consts_ptr = 2; + + for (i = 0; i <= 3; i++) { + s = 1 << i; + + for (j = 0; j < 16; j += 2 * s) { + for (k = j; k < j + s; k++) { + PQCLEAN_MCELIECE348864_AVX_vec256_mul(tmp256, out[k + s], consts[ consts_ptr + (k - j) ]); + + for (b = 0; b < GFBITS; b++) { + out[k][b] ^= tmp256[b]; + } + 
for (b = 0; b < GFBITS; b++) { + out[k + s][b] ^= out[k][b]; + } + } + } + + consts_ptr += s; + } + + // adding the part contributed by x^64 + + vec256 powers[16][GFBITS] = { +#include "powers.inc" + }; + + for (i = 0; i < 16; i++) { + for (b = 0; b < GFBITS; b++) { + out[i][b] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(out[i][b], powers[i][b]); + } + } +} + +void PQCLEAN_MCELIECE348864_AVX_fft(vec256 out[][ GFBITS ], uint64_t *in) { + radix_conversions(in); + butterflies(out, in); +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_avx/fft.h b/src/kem/classic_mceliece/pqclean_mceliece348864_avx/fft.h new file mode 100644 index 0000000000..75a7b44fce --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_avx/fft.h @@ -0,0 +1,18 @@ +#ifndef PQCLEAN_MCELIECE348864_AVX_FFT_H +#define PQCLEAN_MCELIECE348864_AVX_FFT_H + +/* + This file is for the Gao-Mateer FFT + sse http://www.math.clemson.edu/~sgao/papers/GM10.pdf +*/ + +#include <stdint.h> + +#include "params.h" +#include "vec128.h" +#include "vec256.h" + +void PQCLEAN_MCELIECE348864_AVX_fft(vec256 /*out*/[][GFBITS], uint64_t * /*in*/); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_avx/fft_tr.c b/src/kem/classic_mceliece/pqclean_mceliece348864_avx/fft_tr.c new file mode 100644 index 0000000000..23b72a7be7 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_avx/fft_tr.c @@ -0,0 +1,355 @@ +/* + This file is for transpose of the Gao-Mateer FFT + Functions with names ending with _tr are (roughly) the transpose of the corresponding functions in fft.c +*/ + +#include "fft_tr.h" + +#include "transpose.h" +#include "vec.h" + +#include <stdint.h> + +static void radix_conversions_tr(vec128 in[ GFBITS ]) { + int i, j, k; + + const vec128 mask[10] = { + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x2222222222222222, 0x2222222222222222), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x4444444444444444, 0x4444444444444444), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0C0C0C0C0C0C0C0C, 0x0C0C0C0C0C0C0C0C), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x3030303030303030, 0x3030303030303030), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x00F000F000F000F0, 0x00F000F000F000F0), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0F000F000F000F00, 0x0F000F000F000F00), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0000FF000000FF00, 0x0000FF000000FF00), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x00FF000000FF0000, 0x00FF000000FF0000), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x00000000FFFF0000, 0x00000000FFFF0000), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0000FFFF00000000, 0x0000FFFF00000000) + }; + + const vec128 s[5][GFBITS] = { +#include "scalars_2x.inc" + }; + + uint64_t v0, v1; + + // + + for (j = 5; j >= 0; j--) { + + if (j < 5) { + PQCLEAN_MCELIECE348864_AVX_vec128_mul(in, in, s[j]); + } + + for (i = 0; i < GFBITS; i++) { + for (k = j; k <= 4; k++) { + in[i] ^= PQCLEAN_MCELIECE348864_AVX_vec128_sll_2x(in[i] & mask[2 * k + 0], 1 << k); + in[i] ^= PQCLEAN_MCELIECE348864_AVX_vec128_sll_2x(in[i] & mask[2 * k + 1], 1 << k); + } + } + + for (i = 0; i < GFBITS; i++) { + v0 = PQCLEAN_MCELIECE348864_AVX_vec128_extract(in[i], 0); + v1 = PQCLEAN_MCELIECE348864_AVX_vec128_extract(in[i], 1); + + v1 ^= v0 >> 32; + v1 ^= v1 << 32; + + in[i] = PQCLEAN_MCELIECE348864_AVX_vec128_set2x(v0, v1); + } + } +} + +static void butterflies_tr(vec128 out[ GFBITS ], vec256 in[][ GFBITS ]) { + int i, j, k, s, b; + + uint64_t tmp[ GFBITS ]; + uint64_t pre[6][ GFBITS ]; + uint64_t out64[2][64]; + + vec256 p2[ 6 ]; + vec256 buf[64]; + vec256 x[ GFBITS ], y[
GFBITS ]; + vec256 tmp256[ GFBITS ]; + + const vec256 consts[ 17 ][ GFBITS ] = { +#include "consts.inc" + }; + + uint64_t consts_ptr = 17; + + const unsigned char reversal[64] = { + 0, 32, 16, 48, 8, 40, 24, 56, + 4, 36, 20, 52, 12, 44, 28, 60, + 2, 34, 18, 50, 10, 42, 26, 58, + 6, 38, 22, 54, 14, 46, 30, 62, + 1, 33, 17, 49, 9, 41, 25, 57, + 5, 37, 21, 53, 13, 45, 29, 61, + 3, 35, 19, 51, 11, 43, 27, 59, + 7, 39, 23, 55, 15, 47, 31, 63 + }; + + const uint16_t beta[6] = {8, 1300, 3408, 1354, 2341, 1154}; + + // butterflies + + for (i = 3; i >= 0; i--) { + s = 1 << i; + consts_ptr -= s; + + for (j = 0; j < 16; j += 2 * s) { + for (k = j; k < j + s; k++) { + for (b = 0; b < GFBITS; b++) { + in[k][b] ^= in[k + s][b]; + } + PQCLEAN_MCELIECE348864_AVX_vec256_mul(tmp256, in[k], consts[ consts_ptr + (k - j) ]); + for (b = 0; b < GFBITS; b++) { + in[k + s][b] ^= tmp256[b]; + } + } + } + } + + for (i = 0; i < 16; i += 2) { + for (b = 0; b < GFBITS; b++) { + x[b] = PQCLEAN_MCELIECE348864_AVX_vec256_unpack_low(in[i + 0][b], in[i + 1][b]); + } + for (b = 0; b < GFBITS; b++) { + y[b] = PQCLEAN_MCELIECE348864_AVX_vec256_unpack_high(in[i + 0][b], in[i + 1][b]); + } + + for (b = 0; b < GFBITS; b++) { + x[b] ^= y[b]; + } + PQCLEAN_MCELIECE348864_AVX_vec256_mul(tmp256, x, consts[ 1 ]); + for (b = 0; b < GFBITS; b++) { + y[b] ^= tmp256[b]; + } + + for (b = 0; b < GFBITS; b++) { + in[i + 0][b] = PQCLEAN_MCELIECE348864_AVX_vec256_unpack_low_2x(x[b], y[b]); + } + for (b = 0; b < GFBITS; b++) { + in[i + 1][b] = PQCLEAN_MCELIECE348864_AVX_vec256_unpack_high_2x(x[b], y[b]); + } + } + + for (i = 0; i < 16; i += 2) { + for (b = 0; b < GFBITS; b++) { + in[i + 0][b] ^= in[i + 1][b]; + } + PQCLEAN_MCELIECE348864_AVX_vec256_mul(tmp256, in[i + 0], consts[ 0 ]); + for (b = 0; b < GFBITS; b++) { + in[i + 1][b] ^= tmp256[b]; + } + } + + // transpose + + for (i = 0; i < GFBITS; i += 4) { + for (j = 0; j < 64; j += 8) { + buf[ reversal[j + 0] ] = PQCLEAN_MCELIECE348864_AVX_vec256_set4x(PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 0][i + 0], 0), + PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 0][i + 1], 0), + PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 0][i + 2], 0), + PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 0][i + 3], 0)); + buf[ reversal[j + 1] ] = PQCLEAN_MCELIECE348864_AVX_vec256_set4x(PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 1][i + 0], 0), + PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 1][i + 1], 0), + PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 1][i + 2], 0), + PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 1][i + 3], 0)); + buf[ reversal[j + 2] ] = PQCLEAN_MCELIECE348864_AVX_vec256_set4x(PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 0][i + 0], 1), + PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 0][i + 1], 1), + PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 0][i + 2], 1), + PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 0][i + 3], 1)); + buf[ reversal[j + 3] ] = PQCLEAN_MCELIECE348864_AVX_vec256_set4x(PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 1][i + 0], 1), + PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 1][i + 1], 1), + PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 1][i + 2], 1), + PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 1][i + 3], 1)); + buf[ reversal[j + 4] ] = PQCLEAN_MCELIECE348864_AVX_vec256_set4x(PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 0][i + 0], 2), + PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 0][i + 1], 2), + PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j 
/ 4 + 0][i + 2], 2), + PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 0][i + 3], 2)); + buf[ reversal[j + 5] ] = PQCLEAN_MCELIECE348864_AVX_vec256_set4x(PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 1][i + 0], 2), + PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 1][i + 1], 2), + PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 1][i + 2], 2), + PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 1][i + 3], 2)); + buf[ reversal[j + 6] ] = PQCLEAN_MCELIECE348864_AVX_vec256_set4x(PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 0][i + 0], 3), + PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 0][i + 1], 3), + PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 0][i + 2], 3), + PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 0][i + 3], 3)); + buf[ reversal[j + 7] ] = PQCLEAN_MCELIECE348864_AVX_vec256_set4x(PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 1][i + 0], 3), + PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 1][i + 1], 3), + PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 1][i + 2], 3), + PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[j / 4 + 1][i + 3], 3)); + } + + PQCLEAN_MCELIECE348864_AVX_transpose_64x256_sp(buf); + + p2[0] = buf[32]; + buf[33] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[33], buf[32]); + p2[1] = buf[33]; + buf[35] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[35], buf[33]); + p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[35]); + buf[34] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[34], buf[35]); + p2[2] = buf[34]; + buf[38] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[38], buf[34]); + p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[38]); + buf[39] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[39], buf[38]); + p2[1] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[1], buf[39]); + buf[37] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[37], buf[39]); + p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[37]); + buf[36] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[36], buf[37]); + p2[3] = buf[36]; + buf[44] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[44], buf[36]); + p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[44]); + buf[45] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[45], buf[44]); + p2[1] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[1], buf[45]); + buf[47] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[47], buf[45]); + p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[47]); + buf[46] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[46], buf[47]); + p2[2] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[2], buf[46]); + buf[42] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[42], buf[46]); + p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[42]); + buf[43] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[43], buf[42]); + p2[1] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[1], buf[43]); + buf[41] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[41], buf[43]); + p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[41]); + buf[40] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[40], buf[41]); + p2[4] = buf[40]; + buf[56] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[56], buf[40]); + p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[56]); + buf[57] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[57], buf[56]); + p2[1] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[1], buf[57]); + buf[59] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[59], buf[57]); + p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[59]); + buf[58] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[58], buf[59]); + p2[2] = 
PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[2], buf[58]); + buf[62] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[62], buf[58]); + p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[62]); + buf[63] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[63], buf[62]); + p2[1] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[1], buf[63]); + buf[61] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[61], buf[63]); + p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[61]); + buf[60] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[60], buf[61]); + p2[3] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[3], buf[60]); + buf[52] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[52], buf[60]); + p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[52]); + buf[53] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[53], buf[52]); + p2[1] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[1], buf[53]); + buf[55] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[55], buf[53]); + p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[55]); + buf[54] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[54], buf[55]); + p2[2] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[2], buf[54]); + buf[50] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[50], buf[54]); + p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[50]); + buf[51] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[51], buf[50]); + p2[1] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[1], buf[51]); + buf[49] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[49], buf[51]); + p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[49]); + buf[48] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[48], buf[49]); + p2[5] = buf[48]; + buf[16] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[16], buf[48]); + p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[16]); + buf[17] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[17], buf[16]); + p2[1] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[1], buf[17]); + buf[19] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[19], buf[17]); + p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[19]); + buf[18] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[18], buf[19]); + p2[2] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[2], buf[18]); + buf[22] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[22], buf[18]); + p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[22]); + buf[23] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[23], buf[22]); + p2[1] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[1], buf[23]); + buf[21] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[21], buf[23]); + p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[21]); + buf[20] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[20], buf[21]); + p2[3] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[3], buf[20]); + buf[28] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[28], buf[20]); + p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[28]); + buf[29] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[29], buf[28]); + p2[1] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[1], buf[29]); + buf[31] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[31], buf[29]); + p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[31]); + buf[30] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[30], buf[31]); + p2[2] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[2], buf[30]); + buf[26] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[26], buf[30]); + p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[26]); + buf[27] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[27], buf[26]); + p2[1] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[1], buf[27]); + buf[25] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[25], 
buf[27]); + p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[25]); + buf[24] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[24], buf[25]); + p2[4] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[4], buf[24]); + buf[8] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[8], buf[24]); + p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[8]); + buf[9] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[9], buf[8]); + p2[1] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[1], buf[9]); + buf[11] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[11], buf[9]); + p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[11]); + buf[10] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[10], buf[11]); + p2[2] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[2], buf[10]); + buf[14] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[14], buf[10]); + p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[14]); + buf[15] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[15], buf[14]); + p2[1] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[1], buf[15]); + buf[13] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[13], buf[15]); + p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[13]); + buf[12] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[12], buf[13]); + p2[3] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[3], buf[12]); + buf[4] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[4], buf[12]); + p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[4]); + buf[5] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[5], buf[4]); + p2[1] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[1], buf[5]); + buf[7] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[7], buf[5]); + p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[7]); + buf[6] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[6], buf[7]); + p2[2] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[2], buf[6]); + buf[2] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[2], buf[6]); + p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[2]); + buf[3] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[3], buf[2]); + p2[1] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[1], buf[3]); + buf[1] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[1], buf[3]); + + p2[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(p2[0], buf[1]); + buf[0] = PQCLEAN_MCELIECE348864_AVX_vec256_xor(buf[0], buf[1]); + + for (j = 0; j < 6; j++) { + pre[j][i + 0] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(p2[j], 0); + pre[j][i + 1] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(p2[j], 1); + pre[j][i + 2] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(p2[j], 2); + pre[j][i + 3] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(p2[j], 3); + } + + out64[0][i + 0] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(buf[0], 0); + out64[0][i + 1] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(buf[0], 1); + out64[0][i + 2] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(buf[0], 2); + out64[0][i + 3] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(buf[0], 3); + } + + // + + for (j = 0; j < GFBITS; j++) { + tmp[j] = (beta[0] >> j) & 1; + tmp[j] = -tmp[j]; + } + + PQCLEAN_MCELIECE348864_AVX_vec_mul(out64[1], pre[0], tmp); + + for (i = 1; i < 6; i++) { + for (j = 0; j < GFBITS; j++) { + tmp[j] = (beta[i] >> j) & 1; + tmp[j] = -tmp[j]; + } + + PQCLEAN_MCELIECE348864_AVX_vec_mul(tmp, pre[i], tmp); + PQCLEAN_MCELIECE348864_AVX_vec_add(out64[1], out64[1], tmp); + } + + for (i = 0; i < GFBITS; i++) { + out[i] = PQCLEAN_MCELIECE348864_AVX_vec128_set2x(out64[0][i], out64[1][i]); + } +} + +void PQCLEAN_MCELIECE348864_AVX_fft_tr(vec128 out[GFBITS], vec256 in[][ GFBITS ]) { + butterflies_tr(out, in); + radix_conversions_tr(out); +} + diff --git 
a/src/kem/classic_mceliece/pqclean_mceliece348864_avx/fft_tr.h b/src/kem/classic_mceliece/pqclean_mceliece348864_avx/fft_tr.h new file mode 100644 index 0000000000..5442858fe4 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_avx/fft_tr.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE348864_AVX_FFT_TR_H +#define PQCLEAN_MCELIECE348864_AVX_FFT_TR_H +/* + This file is for transpose of the Gao-Mateer FFT +*/ + + +#include "params.h" +#include "vec256.h" + +void PQCLEAN_MCELIECE348864_AVX_fft_tr(vec128 * /*out*/, vec256 /*in*/[][ GFBITS ]); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_avx/gf.c b/src/kem/classic_mceliece/pqclean_mceliece348864_avx/gf.c new file mode 100644 index 0000000000..dca11df822 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_avx/gf.c @@ -0,0 +1,169 @@ +/* + This file is for functions for field arithmetic +*/ + +#include "gf.h" + +#include "params.h" + +gf PQCLEAN_MCELIECE348864_AVX_gf_iszero(gf a) { + uint32_t t = a; + + t -= 1; + t >>= 20; + + return (gf) t; +} + +gf PQCLEAN_MCELIECE348864_AVX_gf_add(gf in0, gf in1) { + return in0 ^ in1; +} + +gf PQCLEAN_MCELIECE348864_AVX_gf_mul(gf in0, gf in1) { + int i; + + uint32_t tmp; + uint32_t t0; + uint32_t t1; + uint32_t t; + + t0 = in0; + t1 = in1; + + tmp = t0 * (t1 & 1); + + for (i = 1; i < GFBITS; i++) { + tmp ^= (t0 * (t1 & (1 << i))); + } + + t = tmp & 0x7FC000; + tmp ^= t >> 9; + tmp ^= t >> 12; + + t = tmp & 0x3000; + tmp ^= t >> 9; + tmp ^= t >> 12; + + return tmp & ((1 << GFBITS) - 1); +} + +/* input: field element in */ +/* return: in^2 */ +static inline gf gf_sq(gf in) { + const uint32_t B[] = {0x55555555, 0x33333333, 0x0F0F0F0F, 0x00FF00FF}; + + uint32_t x = in; + uint32_t t; + + x = (x | (x << 8)) & B[3]; + x = (x | (x << 4)) & B[2]; + x = (x | (x << 2)) & B[1]; + x = (x | (x << 1)) & B[0]; + + t = x & 0x7FC000; + x ^= t >> 9; + x ^= t >> 12; + + t = x & 0x3000; + x ^= t >> 9; + x ^= t >> 12; + + return x & ((1 << GFBITS) - 1); +} + +gf PQCLEAN_MCELIECE348864_AVX_gf_inv(gf in) { + gf tmp_11; + gf tmp_1111; + + gf out = in; + + out = gf_sq(out); + tmp_11 = PQCLEAN_MCELIECE348864_AVX_gf_mul(out, in); // 11 + + out = gf_sq(tmp_11); + out = gf_sq(out); + tmp_1111 = PQCLEAN_MCELIECE348864_AVX_gf_mul(out, tmp_11); // 1111 + + out = gf_sq(tmp_1111); + out = gf_sq(out); + out = gf_sq(out); + out = gf_sq(out); + out = PQCLEAN_MCELIECE348864_AVX_gf_mul(out, tmp_1111); // 11111111 + + out = gf_sq(out); + out = gf_sq(out); + out = PQCLEAN_MCELIECE348864_AVX_gf_mul(out, tmp_11); // 1111111111 + + out = gf_sq(out); + out = PQCLEAN_MCELIECE348864_AVX_gf_mul(out, in); // 11111111111 + + return gf_sq(out); // 111111111110 +} + +/* input: field element den, num */ +/* return: (num/den) */ +gf PQCLEAN_MCELIECE348864_AVX_gf_frac(gf den, gf num) { + return PQCLEAN_MCELIECE348864_AVX_gf_mul(PQCLEAN_MCELIECE348864_AVX_gf_inv(den), num); +} + +/* input: in0, in1 in GF((2^m)^t)*/ +/* output: out = in0*in1 */ +void PQCLEAN_MCELIECE348864_AVX_GF_mul(gf *out, const gf *in0, const gf *in1) { + int i, j; + + gf prod[ SYS_T * 2 - 1 ]; + + for (i = 0; i < SYS_T * 2 - 1; i++) { + prod[i] = 0; + } + + for (i = 0; i < SYS_T; i++) { + for (j = 0; j < SYS_T; j++) { + prod[i + j] ^= PQCLEAN_MCELIECE348864_AVX_gf_mul(in0[i], in1[j]); + } + } + + // + + for (i = (SYS_T - 1) * 2; i >= SYS_T; i--) { + prod[i - SYS_T + 9] ^= PQCLEAN_MCELIECE348864_AVX_gf_mul(prod[i], (gf) 877); + prod[i - SYS_T + 7] ^= PQCLEAN_MCELIECE348864_AVX_gf_mul(prod[i], (gf) 2888); + prod[i - SYS_T + 5] ^= 
PQCLEAN_MCELIECE348864_AVX_gf_mul(prod[i], (gf) 1781); + prod[i - SYS_T + 0] ^= PQCLEAN_MCELIECE348864_AVX_gf_mul(prod[i], (gf) 373); + } + + for (i = 0; i < SYS_T; i++) { + out[i] = prod[i]; + } +} + +/* 2 field multiplications */ +uint64_t PQCLEAN_MCELIECE348864_AVX_gf_mul2(gf a, gf b0, gf b1) { + int i; + + uint64_t tmp = 0; + uint64_t t0; + uint64_t t1; + uint64_t t; + uint64_t mask = 0x0000000100000001; + + t0 = a; + t1 = b1; + t1 = (t1 << 32) | b0; + + for (i = 0; i < GFBITS; i++) { + tmp ^= t0 * (t1 & mask); + mask += mask; + } + + // + + t = tmp & 0x007FC000007FC000; + tmp ^= (t >> 9) ^ (t >> 12); + + t = tmp & 0x0000300000003000; + tmp ^= (t >> 9) ^ (t >> 12); + + return tmp & 0x00000FFF00000FFF; +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_avx/gf.h b/src/kem/classic_mceliece/pqclean_mceliece348864_avx/gf.h new file mode 100644 index 0000000000..3afe96ecc5 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_avx/gf.h @@ -0,0 +1,26 @@ +#ifndef PQCLEAN_MCELIECE348864_AVX_GF_H +#define PQCLEAN_MCELIECE348864_AVX_GF_H +/* + This file is for functions for field arithmetic +*/ + + +#include "params.h" + +#include + +typedef uint16_t gf; + +gf PQCLEAN_MCELIECE348864_AVX_gf_iszero(gf /*a*/); +gf PQCLEAN_MCELIECE348864_AVX_gf_add(gf /*in0*/, gf /*in1*/); +gf PQCLEAN_MCELIECE348864_AVX_gf_mul(gf /*in0*/, gf /*in1*/); +gf PQCLEAN_MCELIECE348864_AVX_gf_frac(gf /*den*/, gf /*num*/); +gf PQCLEAN_MCELIECE348864_AVX_gf_inv(gf /*in*/); + +void PQCLEAN_MCELIECE348864_AVX_GF_mul(gf * /*out*/, const gf * /*in0*/, const gf * /*in1*/); + +/* 2 field multiplications */ +uint64_t PQCLEAN_MCELIECE348864_AVX_gf_mul2(gf a, gf b0, gf b1); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_avx/int32_sort.c b/src/kem/classic_mceliece/pqclean_mceliece348864_avx/int32_sort.c new file mode 100644 index 0000000000..f984819f1a --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_avx/int32_sort.c @@ -0,0 +1,1211 @@ +#include "int32_sort.h" +#define int32 int32_t + +#include + +typedef __m256i int32x8; +#define int32x8_load(z) _mm256_loadu_si256((__m256i_u *) (z)) +#define int32x8_store(z,i) _mm256_storeu_si256((__m256i_u *) (z),(i)) +#define int32x8_min _mm256_min_epi32 +#define int32x8_max _mm256_max_epi32 + +#define int32x8_MINMAX(a,b) \ + do { \ + int32x8 c = int32x8_min(a, b); \ + (b) = int32x8_max(a,b); \ + (a) = c; \ + } while(0) + +static inline void int32_MINMAX(int32 *a, int32 *b) { + int32 ab = *b ^ *a; + int32 c = *b - *a; + c ^= ab & (c ^ *b); + c >>= 31; + c &= ab; + *a ^= c; + *b ^= c; +} + +static void minmax_vector(int32 *x, int32 *y, size_t n) { + if (n < 8) { + while (n > 0) { + int32_MINMAX(x, y); + ++x; + ++y; + --n; + } + return; + } + if (n & 7) { + int32x8 x0 = int32x8_load(x + n - 8); + int32x8 y0 = int32x8_load(y + n - 8); + int32x8_MINMAX(x0, y0); + int32x8_store(x + n - 8, x0); + int32x8_store(y + n - 8, y0); + n &= ~7; + } + do { + int32x8 x0 = int32x8_load(x); + int32x8 y0 = int32x8_load(y); + int32x8_MINMAX(x0, y0); + int32x8_store(x, x0); + int32x8_store(y, y0); + x += 8; + y += 8; + n -= 8; + } while (n); +} + +/* stages 8,4,2,1 of size-16 bitonic merging */ +static void merge16_finish(int32 *x, int32x8 x0, int32x8 x1, int flagdown) { + int32x8 b0, b1, c0, c1, mask; + + int32x8_MINMAX(x0, x1); + + b0 = _mm256_permute2x128_si256(x0, x1, 0x20); /* A0123B0123 */ + b1 = _mm256_permute2x128_si256(x0, x1, 0x31); /* A4567B4567 */ + + int32x8_MINMAX(b0, b1); + + c0 = _mm256_unpacklo_epi64(b0, b1); /* A0145B0145 */ + c1 = 
_mm256_unpackhi_epi64(b0, b1); /* A2367B2367 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A0213B0213 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A4657B4657 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* A0246B0246 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* A1357B1357 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A0123B0123 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A4567B4567 */ + + x0 = _mm256_permute2x128_si256(b0, b1, 0x20); /* A01234567 */ + x1 = _mm256_permute2x128_si256(b0, b1, 0x31); /* A01234567 */ + + if (flagdown) { + mask = _mm256_set1_epi32(-1); + x0 ^= mask; + x1 ^= mask; + } + + int32x8_store(&x[0], x0); + int32x8_store(&x[8], x1); +} + +/* stages 64,32 of bitonic merging; n is multiple of 128 */ +static void int32_twostages_32(int32 *x, size_t n) { + size_t i; + + while (n > 0) { + for (i = 0; i < 32; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + 32]); + int32x8 x2 = int32x8_load(&x[i + 64]); + int32x8 x3 = int32x8_load(&x[i + 96]); + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + 32], x1); + int32x8_store(&x[i + 64], x2); + int32x8_store(&x[i + 96], x3); + } + x += 128; + n -= 128; + } +} + +/* stages 4q,2q,q of bitonic merging */ +static size_t int32_threestages(int32 *x, size_t n, size_t q) { + size_t k, i; + + for (k = 0; k + 8 * q <= n; k += 8 * q) { + for (i = k; i < k + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + } + + return k; +} + +/* n is a power of 2; n >= 8; if n == 8 then flagdown */ +// NOLINTNEXTLINE(google-readability-function-size) +static void int32_sort_2power(int32 *x, size_t n, int flagdown) { + size_t p, q, i, j, k; + int32x8 mask; + + if (n == 8) { + int32 x0 = x[0]; + int32 x1 = x[1]; + int32 x2 = x[2]; + int32 x3 = x[3]; + int32 x4 = x[4]; + int32 x5 = x[5]; + int32 x6 = x[6]; + int32 x7 = x[7]; + + /* odd-even sort instead of bitonic sort */ + + int32_MINMAX(&x1, &x0); + int32_MINMAX(&x3, &x2); + int32_MINMAX(&x2, &x0); + int32_MINMAX(&x3, &x1); + int32_MINMAX(&x2, &x1); + + int32_MINMAX(&x5, &x4); + int32_MINMAX(&x7, &x6); + int32_MINMAX(&x6, &x4); + int32_MINMAX(&x7, &x5); + int32_MINMAX(&x6, &x5); + + int32_MINMAX(&x4, &x0); + int32_MINMAX(&x6, &x2); + int32_MINMAX(&x4, &x2); + + int32_MINMAX(&x5, &x1); + int32_MINMAX(&x7, &x3); + int32_MINMAX(&x5, &x3); + + int32_MINMAX(&x2, &x1); + int32_MINMAX(&x4, &x3); + int32_MINMAX(&x6, &x5); + + x[0] = x0; + x[1] = x1; + x[2] = x2; + x[3] = x3; + x[4] = x4; + x[5] = x5; + x[6] = x6; + x[7] = x7; + return; + } + + if 
(n == 16) { + int32x8 x0, x1, b0, b1, c0, c1; + + x0 = int32x8_load(&x[0]); + x1 = int32x8_load(&x[8]); + + mask = _mm256_set_epi32(0, 0, -1, -1, 0, 0, -1, -1); + + x0 ^= mask; /* A01234567 */ + x1 ^= mask; /* B01234567 */ + + b0 = _mm256_unpacklo_epi32(x0, x1); /* AB0AB1AB4AB5 */ + b1 = _mm256_unpackhi_epi32(x0, x1); /* AB2AB3AB6AB7 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* AB0AB2AB4AB6 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* AB1AB3AB5AB7 */ + + int32x8_MINMAX(c0, c1); + + mask = _mm256_set_epi32(0, 0, -1, -1, -1, -1, 0, 0); + c0 ^= mask; + c1 ^= mask; + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A01B01A45B45 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A23B23A67B67 */ + + int32x8_MINMAX(b0, b1); + + x0 = _mm256_unpacklo_epi64(b0, b1); /* A01234567 */ + x1 = _mm256_unpackhi_epi64(b0, b1); /* B01234567 */ + + b0 = _mm256_unpacklo_epi32(x0, x1); /* AB0AB1AB4AB5 */ + b1 = _mm256_unpackhi_epi32(x0, x1); /* AB2AB3AB6AB7 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* AB0AB2AB4AB6 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* AB1AB3AB5AB7 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A01B01A45B45 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A23B23A67B67 */ + + b0 ^= mask; + b1 ^= mask; + + c0 = _mm256_permute2x128_si256(b0, b1, 0x20); /* A01B01A23B23 */ + c1 = _mm256_permute2x128_si256(b0, b1, 0x31); /* A45B45A67B67 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_permute2x128_si256(c0, c1, 0x20); /* A01B01A45B45 */ + b1 = _mm256_permute2x128_si256(c0, c1, 0x31); /* A23B23A67B67 */ + + int32x8_MINMAX(b0, b1); + + x0 = _mm256_unpacklo_epi64(b0, b1); /* A01234567 */ + x1 = _mm256_unpackhi_epi64(b0, b1); /* B01234567 */ + + b0 = _mm256_unpacklo_epi32(x0, x1); /* AB0AB1AB4AB5 */ + b1 = _mm256_unpackhi_epi32(x0, x1); /* AB2AB3AB6AB7 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* AB0AB2AB4AB6 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* AB1AB3AB5AB7 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A01B01A45B45 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A23B23A67B67 */ + + x0 = _mm256_unpacklo_epi64(b0, b1); /* A01234567 */ + x1 = _mm256_unpackhi_epi64(b0, b1); /* B01234567 */ + + mask = _mm256_set1_epi32(-1); + if (flagdown) { + x1 ^= mask; + } else { + x0 ^= mask; + } + + merge16_finish(x, x0, x1, flagdown); + return; + } + + if (n == 32) { + int32x8 x0, x1, x2, x3; + + int32_sort_2power(x, 16, 1); + int32_sort_2power(x + 16, 16, 0); + + x0 = int32x8_load(&x[0]); + x1 = int32x8_load(&x[8]); + x2 = int32x8_load(&x[16]); + x3 = int32x8_load(&x[24]); + + if (flagdown) { + mask = _mm256_set1_epi32(-1); + x0 ^= mask; + x1 ^= mask; + x2 ^= mask; + x3 ^= mask; + } + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + + merge16_finish(x, x0, x1, flagdown); + merge16_finish(x + 16, x2, x3, flagdown); + return; + } + + p = n >> 3; + for (i = 0; i < p; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x2 = int32x8_load(&x[i + 2 * p]); + int32x8 x4 = int32x8_load(&x[i + 4 * p]); + int32x8 x6 = int32x8_load(&x[i + 6 * p]); + + /* odd-even stage instead of bitonic stage */ + + int32x8_MINMAX(x4, x0); + int32x8_MINMAX(x6, x2); + int32x8_MINMAX(x2, x0); + int32x8_MINMAX(x6, x4); + int32x8_MINMAX(x2, x4); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + 2 * p], x2); + int32x8_store(&x[i + 4 * p], x4); + int32x8_store(&x[i + 6 * p], x6); + + int32x8 x1 = int32x8_load(&x[i + p]); + int32x8 x3 = int32x8_load(&x[i + 3 * p]); + int32x8 x5 = int32x8_load(&x[i + 5 * p]); + int32x8 x7 = int32x8_load(&x[i + 7 * p]); + + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x3, 
x7); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x5, x3); + + int32x8_store(&x[i + p], x1); + int32x8_store(&x[i + 3 * p], x3); + int32x8_store(&x[i + 5 * p], x5); + int32x8_store(&x[i + 7 * p], x7); + } + + if (n >= 128) { + int flip, flipflip; + + mask = _mm256_set1_epi32(-1); + + for (j = 0; j < n; j += 32) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 16]); + x0 ^= mask; + x1 ^= mask; + int32x8_store(&x[j], x0); + int32x8_store(&x[j + 16], x1); + } + + p = 8; + for (;;) { /* for p in [8, 16, ..., n/16] */ + q = p >> 1; + while (q >= 128) { + int32_threestages(x, n, q >> 2); + q >>= 3; + } + if (q == 64) { + int32_twostages_32(x, n); + q = 16; + } + if (q == 32) { + q = 8; + for (k = 0; k < n; k += 8 * q) { + for (i = k; i < k + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + } + q = 4; + } + if (q == 16) { + q = 8; + for (k = 0; k < n; k += 4 * q) { + for (i = k; i < k + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + } + } + q = 4; + } + if (q == 8) { + for (k = 0; k < n; k += q + q) { + int32x8 x0 = int32x8_load(&x[k]); + int32x8 x1 = int32x8_load(&x[k + q]); + + int32x8_MINMAX(x0, x1); + + int32x8_store(&x[k], x0); + int32x8_store(&x[k + q], x1); + } + } + + q = n >> 3; + flip = 0; + if (p << 1 == q) { + flip = 1; + } + flipflip = 1 - flip; + for (j = 0; j < q; j += p + p) { + for (k = j; k < j + p + p; k += p) { + for (i = k; i < k + p; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + + if (flip) { + x0 ^= mask; + x1 ^= mask; + x2 ^= mask; + x3 ^= mask; + x4 ^= mask; + x5 ^= mask; + x6 ^= mask; + x7 ^= mask; + } + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + 
int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + flip ^= 1; + } + flip ^= flipflip; + } + + if (p << 4 == n) { + break; + } + p <<= 1; + } + } + + for (p = 4; p >= 1; p >>= 1) { + int32 *z = x; + int32 *target = x + n; + if (p == 4) { + mask = _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1); + while (z != target) { + int32x8 x0 = int32x8_load(&z[0]); + int32x8 x1 = int32x8_load(&z[8]); + x0 ^= mask; + x1 ^= mask; + int32x8_store(&z[0], x0); + int32x8_store(&z[8], x1); + z += 16; + } + } else if (p == 2) { + mask = _mm256_set_epi32(0, 0, -1, -1, -1, -1, 0, 0); + while (z != target) { + int32x8 x0 = int32x8_load(&z[0]); + int32x8 x1 = int32x8_load(&z[8]); + x0 ^= mask; + x1 ^= mask; + int32x8 b0 = _mm256_permute2x128_si256(x0, x1, 0x20); + int32x8 b1 = _mm256_permute2x128_si256(x0, x1, 0x31); + int32x8_MINMAX(b0, b1); + int32x8 c0 = _mm256_permute2x128_si256(b0, b1, 0x20); + int32x8 c1 = _mm256_permute2x128_si256(b0, b1, 0x31); + int32x8_store(&z[0], c0); + int32x8_store(&z[8], c1); + z += 16; + } + } else { /* p == 1 */ + mask = _mm256_set_epi32(0, -1, -1, 0, 0, -1, -1, 0); + while (z != target) { + int32x8 x0 = int32x8_load(&z[0]); + int32x8 x1 = int32x8_load(&z[8]); + x0 ^= mask; + x1 ^= mask; + int32x8 b0 = _mm256_permute2x128_si256(x0, x1, 0x20); /* A0123B0123 */ + int32x8 b1 = _mm256_permute2x128_si256(x0, x1, 0x31); /* A4567B4567 */ + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); /* A0145B0145 */ + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); /* A2367B2367 */ + int32x8_MINMAX(c0, c1); + int32x8 d0 = _mm256_unpacklo_epi64(c0, c1); /* A0123B0123 */ + int32x8 d1 = _mm256_unpackhi_epi64(c0, c1); /* A4567B4567 */ + int32x8_MINMAX(d0, d1); + int32x8 e0 = _mm256_permute2x128_si256(d0, d1, 0x20); + int32x8 e1 = _mm256_permute2x128_si256(d0, d1, 0x31); + int32x8_store(&z[0], e0); + int32x8_store(&z[8], e1); + z += 16; + } + } + + q = n >> 4; + while (q >= 128 || q == 32) { + int32_threestages(x, n, q >> 2); + q >>= 3; + } + while (q >= 16) { + q >>= 1; + for (j = 0; j < n; j += 4 * q) { + for (k = j; k < j + q; k += 8) { + int32x8 x0 = int32x8_load(&x[k]); + int32x8 x1 = int32x8_load(&x[k + q]); + int32x8 x2 = int32x8_load(&x[k + 2 * q]); + int32x8 x3 = int32x8_load(&x[k + 3 * q]); + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + + int32x8_store(&x[k], x0); + int32x8_store(&x[k + q], x1); + int32x8_store(&x[k + 2 * q], x2); + int32x8_store(&x[k + 3 * q], x3); + } + } + q >>= 1; + } + if (q == 8) { + for (j = 0; j < n; j += 2 * q) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + q]); + + int32x8_MINMAX(x0, x1); + + int32x8_store(&x[j], x0); + int32x8_store(&x[j + q], x1); + } + } + + q = n >> 3; + for (k = 0; k < q; k += 8) { + int32x8 x0 = int32x8_load(&x[k]); + int32x8 x1 = int32x8_load(&x[k + q]); + int32x8 x2 = int32x8_load(&x[k + 2 * q]); + int32x8 x3 = int32x8_load(&x[k + 3 * q]); + int32x8 x4 = int32x8_load(&x[k + 4 * q]); + int32x8 x5 = int32x8_load(&x[k + 5 * q]); + int32x8 x6 = int32x8_load(&x[k + 6 * q]); + int32x8 x7 = int32x8_load(&x[k + 7 * q]); + + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + 
int32x8_MINMAX(x3, x7); + + int32x8_store(&x[k], x0); + int32x8_store(&x[k + q], x1); + int32x8_store(&x[k + 2 * q], x2); + int32x8_store(&x[k + 3 * q], x3); + int32x8_store(&x[k + 4 * q], x4); + int32x8_store(&x[k + 5 * q], x5); + int32x8_store(&x[k + 6 * q], x6); + int32x8_store(&x[k + 7 * q], x7); + } + } + + /* everything is still masked with _mm256_set_epi32(0,-1,0,-1,0,-1,0,-1); */ + mask = _mm256_set1_epi32(-1); + + for (i = 0; i < n; i += 64) { + int32x8 a0 = int32x8_load(&x[i]); + int32x8 a1 = int32x8_load(&x[i + 8]); + int32x8 a2 = int32x8_load(&x[i + 16]); + int32x8 a3 = int32x8_load(&x[i + 24]); + int32x8 a4 = int32x8_load(&x[i + 32]); + int32x8 a5 = int32x8_load(&x[i + 40]); + int32x8 a6 = int32x8_load(&x[i + 48]); + int32x8 a7 = int32x8_load(&x[i + 56]); + + int32x8 b0 = _mm256_unpacklo_epi32(a0, a1); /* AB0AB1AB4AB5 */ + int32x8 b1 = _mm256_unpackhi_epi32(a0, a1); /* AB2AB3AB6AB7 */ + int32x8 b2 = _mm256_unpacklo_epi32(a2, a3); /* CD0CD1CD4CD5 */ + int32x8 b3 = _mm256_unpackhi_epi32(a2, a3); /* CD2CD3CD6CD7 */ + int32x8 b4 = _mm256_unpacklo_epi32(a4, a5); /* EF0EF1EF4EF5 */ + int32x8 b5 = _mm256_unpackhi_epi32(a4, a5); /* EF2EF3EF6EF7 */ + int32x8 b6 = _mm256_unpacklo_epi32(a6, a7); /* GH0GH1GH4GH5 */ + int32x8 b7 = _mm256_unpackhi_epi32(a6, a7); /* GH2GH3GH6GH7 */ + + int32x8 c0 = _mm256_unpacklo_epi64(b0, b2); /* ABCD0ABCD4 */ + int32x8 c1 = _mm256_unpacklo_epi64(b1, b3); /* ABCD2ABCD6 */ + int32x8 c2 = _mm256_unpackhi_epi64(b0, b2); /* ABCD1ABCD5 */ + int32x8 c3 = _mm256_unpackhi_epi64(b1, b3); /* ABCD3ABCD7 */ + int32x8 c4 = _mm256_unpacklo_epi64(b4, b6); /* EFGH0EFGH4 */ + int32x8 c5 = _mm256_unpacklo_epi64(b5, b7); /* EFGH2EFGH6 */ + int32x8 c6 = _mm256_unpackhi_epi64(b4, b6); /* EFGH1EFGH5 */ + int32x8 c7 = _mm256_unpackhi_epi64(b5, b7); /* EFGH3EFGH7 */ + + if (flagdown) { + c2 ^= mask; + c3 ^= mask; + c6 ^= mask; + c7 ^= mask; + } else { + c0 ^= mask; + c1 ^= mask; + c4 ^= mask; + c5 ^= mask; + } + + int32x8 d0 = _mm256_permute2x128_si256(c0, c4, 0x20); /* ABCDEFGH0 */ + int32x8 d1 = _mm256_permute2x128_si256(c2, c6, 0x20); /* ABCDEFGH1 */ + int32x8 d2 = _mm256_permute2x128_si256(c1, c5, 0x20); /* ABCDEFGH2 */ + int32x8 d3 = _mm256_permute2x128_si256(c3, c7, 0x20); /* ABCDEFGH5 */ + int32x8 d4 = _mm256_permute2x128_si256(c0, c4, 0x31); /* ABCDEFGH4 */ + int32x8 d5 = _mm256_permute2x128_si256(c2, c6, 0x31); /* ABCDEFGH3 */ + int32x8 d6 = _mm256_permute2x128_si256(c1, c5, 0x31); /* ABCDEFGH6 */ + int32x8 d7 = _mm256_permute2x128_si256(c3, c7, 0x31); /* ABCDEFGH7 */ + + int32x8_MINMAX(d0, d1); + int32x8_MINMAX(d2, d3); + int32x8_MINMAX(d4, d5); + int32x8_MINMAX(d6, d7); + int32x8_MINMAX(d0, d2); + int32x8_MINMAX(d1, d3); + int32x8_MINMAX(d4, d6); + int32x8_MINMAX(d5, d7); + int32x8_MINMAX(d0, d4); + int32x8_MINMAX(d1, d5); + int32x8_MINMAX(d2, d6); + int32x8_MINMAX(d3, d7); + + int32x8 e0 = _mm256_unpacklo_epi32(d0, d1); + int32x8 e1 = _mm256_unpackhi_epi32(d0, d1); + int32x8 e2 = _mm256_unpacklo_epi32(d2, d3); + int32x8 e3 = _mm256_unpackhi_epi32(d2, d3); + int32x8 e4 = _mm256_unpacklo_epi32(d4, d5); + int32x8 e5 = _mm256_unpackhi_epi32(d4, d5); + int32x8 e6 = _mm256_unpacklo_epi32(d6, d7); + int32x8 e7 = _mm256_unpackhi_epi32(d6, d7); + + int32x8 f0 = _mm256_unpacklo_epi64(e0, e2); + int32x8 f1 = _mm256_unpacklo_epi64(e1, e3); + int32x8 f2 = _mm256_unpackhi_epi64(e0, e2); + int32x8 f3 = _mm256_unpackhi_epi64(e1, e3); + int32x8 f4 = _mm256_unpacklo_epi64(e4, e6); + int32x8 f5 = _mm256_unpacklo_epi64(e5, e7); + int32x8 f6 = _mm256_unpackhi_epi64(e4, e6); + int32x8 f7 = 
_mm256_unpackhi_epi64(e5, e7); + + int32x8 g0 = _mm256_permute2x128_si256(f0, f4, 0x20); + int32x8 g1 = _mm256_permute2x128_si256(f2, f6, 0x20); + int32x8 g2 = _mm256_permute2x128_si256(f1, f5, 0x20); + int32x8 g3 = _mm256_permute2x128_si256(f3, f7, 0x20); + int32x8 g4 = _mm256_permute2x128_si256(f0, f4, 0x31); + int32x8 g5 = _mm256_permute2x128_si256(f2, f6, 0x31); + int32x8 g6 = _mm256_permute2x128_si256(f1, f5, 0x31); + int32x8 g7 = _mm256_permute2x128_si256(f3, f7, 0x31); + + int32x8_store(&x[i], g0); + int32x8_store(&x[i + 8], g1); + int32x8_store(&x[i + 16], g2); + int32x8_store(&x[i + 24], g3); + int32x8_store(&x[i + 32], g4); + int32x8_store(&x[i + 40], g5); + int32x8_store(&x[i + 48], g6); + int32x8_store(&x[i + 56], g7); + } + + q = n >> 4; + while (q >= 128 || q == 32) { + q >>= 2; + for (j = 0; j < n; j += 8 * q) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + } + q >>= 1; + } + while (q >= 16) { + q >>= 1; + for (j = 0; j < n; j += 4 * q) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + } + } + q >>= 1; + } + if (q == 8) { + for (j = 0; j < n; j += q + q) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + q]); + int32x8_MINMAX(x0, x1); + int32x8_store(&x[j], x0); + int32x8_store(&x[j + q], x1); + } + } + + q = n >> 3; + for (i = 0; i < q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + + int32x8 b0 = _mm256_unpacklo_epi32(x0, x4); /* AE0AE1AE4AE5 */ + int32x8 b1 = _mm256_unpackhi_epi32(x0, x4); /* AE2AE3AE6AE7 */ + int32x8 b2 = _mm256_unpacklo_epi32(x1, x5); /* BF0BF1BF4BF5 */ + int32x8 b3 = _mm256_unpackhi_epi32(x1, x5); /* BF2BF3BF6BF7 */ + int32x8 b4 = 
_mm256_unpacklo_epi32(x2, x6); /* CG0CG1CG4CG5 */ + int32x8 b5 = _mm256_unpackhi_epi32(x2, x6); /* CG2CG3CG6CG7 */ + int32x8 b6 = _mm256_unpacklo_epi32(x3, x7); /* DH0DH1DH4DH5 */ + int32x8 b7 = _mm256_unpackhi_epi32(x3, x7); /* DH2DH3DH6DH7 */ + + int32x8 c0 = _mm256_unpacklo_epi64(b0, b4); /* AECG0AECG4 */ + int32x8 c1 = _mm256_unpacklo_epi64(b1, b5); /* AECG2AECG6 */ + int32x8 c2 = _mm256_unpackhi_epi64(b0, b4); /* AECG1AECG5 */ + int32x8 c3 = _mm256_unpackhi_epi64(b1, b5); /* AECG3AECG7 */ + int32x8 c4 = _mm256_unpacklo_epi64(b2, b6); /* BFDH0BFDH4 */ + int32x8 c5 = _mm256_unpacklo_epi64(b3, b7); /* BFDH2BFDH6 */ + int32x8 c6 = _mm256_unpackhi_epi64(b2, b6); /* BFDH1BFDH5 */ + int32x8 c7 = _mm256_unpackhi_epi64(b3, b7); /* BFDH3BFDH7 */ + + int32x8 d0 = _mm256_permute2x128_si256(c0, c4, 0x20); /* AECGBFDH0 */ + int32x8 d1 = _mm256_permute2x128_si256(c1, c5, 0x20); /* AECGBFDH2 */ + int32x8 d2 = _mm256_permute2x128_si256(c2, c6, 0x20); /* AECGBFDH1 */ + int32x8 d3 = _mm256_permute2x128_si256(c3, c7, 0x20); /* AECGBFDH3 */ + int32x8 d4 = _mm256_permute2x128_si256(c0, c4, 0x31); /* AECGBFDH4 */ + int32x8 d5 = _mm256_permute2x128_si256(c1, c5, 0x31); /* AECGBFDH6 */ + int32x8 d6 = _mm256_permute2x128_si256(c2, c6, 0x31); /* AECGBFDH5 */ + int32x8 d7 = _mm256_permute2x128_si256(c3, c7, 0x31); /* AECGBFDH7 */ + + if (flagdown) { + d0 ^= mask; + d1 ^= mask; + d2 ^= mask; + d3 ^= mask; + d4 ^= mask; + d5 ^= mask; + d6 ^= mask; + d7 ^= mask; + } + + int32x8_store(&x[i], d0); + int32x8_store(&x[i + q], d4); + int32x8_store(&x[i + 2 * q], d1); + int32x8_store(&x[i + 3 * q], d5); + int32x8_store(&x[i + 4 * q], d2); + int32x8_store(&x[i + 5 * q], d6); + int32x8_store(&x[i + 6 * q], d3); + int32x8_store(&x[i + 7 * q], d7); + } +} + +void PQCLEAN_MCELIECE348864_AVX_int32_sort(int32 *x, size_t n) { + size_t q, i, j; + + if (n <= 8) { + if (n == 8) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + int32_MINMAX(&x[4], &x[5]); + int32_MINMAX(&x[5], &x[6]); + int32_MINMAX(&x[6], &x[7]); + } + if (n >= 7) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + int32_MINMAX(&x[4], &x[5]); + int32_MINMAX(&x[5], &x[6]); + } + if (n >= 6) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + int32_MINMAX(&x[4], &x[5]); + } + if (n >= 5) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + } + if (n >= 4) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + } + if (n >= 3) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + } + if (n >= 2) { + int32_MINMAX(&x[0], &x[1]); + } + return; + } + + if (!(n & (n - 1))) { + int32_sort_2power(x, n, 0); + return; + } + + q = 8; + while (q < n - q) { + q += q; + } + /* n > q >= 8 */ + + if (q <= 128) { /* n <= 256 */ + int32x8 y[32]; + for (i = q >> 3; i < q >> 2; ++i) { + y[i] = _mm256_set1_epi32(0x7fffffff); + } + for (i = 0; i < n; ++i) { + ((int32 *)y)[i] = x[i]; + } + int32_sort_2power((int32 *) y, 2 * q, 0); + for (i = 0; i < n; ++i) { + x[i] = ((int32 *) y)[i]; + } + return; + } + + int32_sort_2power(x, q, 1); + PQCLEAN_MCELIECE348864_AVX_int32_sort(x + q, n - q); + + while (q >= 64) { + q >>= 2; + j = int32_threestages(x, n, q); + minmax_vector(x + j, x + j + 4 * q, n - 4 * q - j); + if (j + 4 * q <= n) { + for (i = j; i < j + q; i += 
8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + } + j += 4 * q; + } + minmax_vector(x + j, x + j + 2 * q, n - 2 * q - j); + if (j + 2 * q <= n) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8_MINMAX(x0, x1); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + } + j += 2 * q; + } + minmax_vector(x + j, x + j + q, n - q - j); + q >>= 1; + } + if (q == 32) { + j = 0; + for (; j + 64 <= n; j += 64) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 8]); + int32x8 x2 = int32x8_load(&x[j + 16]); + int32x8 x3 = int32x8_load(&x[j + 24]); + int32x8 x4 = int32x8_load(&x[j + 32]); + int32x8 x5 = int32x8_load(&x[j + 40]); + int32x8 x6 = int32x8_load(&x[j + 48]); + int32x8 x7 = int32x8_load(&x[j + 56]); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8 a0 = _mm256_permute2x128_si256(x0, x1, 0x20); + int32x8 a1 = _mm256_permute2x128_si256(x0, x1, 0x31); + int32x8 a2 = _mm256_permute2x128_si256(x2, x3, 0x20); + int32x8 a3 = _mm256_permute2x128_si256(x2, x3, 0x31); + int32x8 a4 = _mm256_permute2x128_si256(x4, x5, 0x20); + int32x8 a5 = _mm256_permute2x128_si256(x4, x5, 0x31); + int32x8 a6 = _mm256_permute2x128_si256(x6, x7, 0x20); + int32x8 a7 = _mm256_permute2x128_si256(x6, x7, 0x31); + int32x8_MINMAX(a0, a1); + int32x8_MINMAX(a2, a3); + int32x8_MINMAX(a4, a5); + int32x8_MINMAX(a6, a7); + int32x8 b0 = _mm256_permute2x128_si256(a0, a1, 0x20); + int32x8 b1 = _mm256_permute2x128_si256(a0, a1, 0x31); + int32x8 b2 = _mm256_permute2x128_si256(a2, a3, 0x20); + int32x8 b3 = _mm256_permute2x128_si256(a2, a3, 0x31); + int32x8 b4 = _mm256_permute2x128_si256(a4, a5, 0x20); + int32x8 b5 = _mm256_permute2x128_si256(a4, a5, 0x31); + int32x8 b6 = _mm256_permute2x128_si256(a6, a7, 0x20); + int32x8 b7 = _mm256_permute2x128_si256(a6, a7, 0x31); + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); + int32x8 c2 = _mm256_unpacklo_epi64(b2, b3); + int32x8 c3 = _mm256_unpackhi_epi64(b2, b3); + int32x8 c4 = _mm256_unpacklo_epi64(b4, b5); + int32x8 c5 = _mm256_unpackhi_epi64(b4, b5); + int32x8 c6 = _mm256_unpacklo_epi64(b6, b7); + int32x8 c7 = _mm256_unpackhi_epi64(b6, b7); + int32x8_MINMAX(c0, c1); + int32x8_MINMAX(c2, c3); + int32x8_MINMAX(c4, c5); + int32x8_MINMAX(c6, c7); + int32x8 d0 = _mm256_unpacklo_epi32(c0, c1); + int32x8 d1 = _mm256_unpackhi_epi32(c0, c1); + int32x8 d2 = _mm256_unpacklo_epi32(c2, c3); + int32x8 d3 = _mm256_unpackhi_epi32(c2, c3); + int32x8 d4 = _mm256_unpacklo_epi32(c4, c5); + int32x8 d5 = _mm256_unpackhi_epi32(c4, c5); + int32x8 d6 = _mm256_unpacklo_epi32(c6, c7); + int32x8 d7 = _mm256_unpackhi_epi32(c6, c7); + int32x8 e0 = _mm256_unpacklo_epi64(d0, d1); + int32x8 e1 = _mm256_unpackhi_epi64(d0, d1); + int32x8 e2 = _mm256_unpacklo_epi64(d2, d3); + int32x8 e3 = _mm256_unpackhi_epi64(d2, d3); + int32x8 e4 = _mm256_unpacklo_epi64(d4, d5); + 
int32x8 e5 = _mm256_unpackhi_epi64(d4, d5); + int32x8 e6 = _mm256_unpacklo_epi64(d6, d7); + int32x8 e7 = _mm256_unpackhi_epi64(d6, d7); + int32x8_MINMAX(e0, e1); + int32x8_MINMAX(e2, e3); + int32x8_MINMAX(e4, e5); + int32x8_MINMAX(e6, e7); + int32x8 f0 = _mm256_unpacklo_epi32(e0, e1); + int32x8 f1 = _mm256_unpackhi_epi32(e0, e1); + int32x8 f2 = _mm256_unpacklo_epi32(e2, e3); + int32x8 f3 = _mm256_unpackhi_epi32(e2, e3); + int32x8 f4 = _mm256_unpacklo_epi32(e4, e5); + int32x8 f5 = _mm256_unpackhi_epi32(e4, e5); + int32x8 f6 = _mm256_unpacklo_epi32(e6, e7); + int32x8 f7 = _mm256_unpackhi_epi32(e6, e7); + int32x8_store(&x[j], f0); + int32x8_store(&x[j + 8], f1); + int32x8_store(&x[j + 16], f2); + int32x8_store(&x[j + 24], f3); + int32x8_store(&x[j + 32], f4); + int32x8_store(&x[j + 40], f5); + int32x8_store(&x[j + 48], f6); + int32x8_store(&x[j + 56], f7); + } + minmax_vector(x + j, x + j + 32, n - 32 - j); + goto continue16; + } + if (q == 16) { + j = 0; +continue16: + for (; j + 32 <= n; j += 32) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 8]); + int32x8 x2 = int32x8_load(&x[j + 16]); + int32x8 x3 = int32x8_load(&x[j + 24]); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8 a0 = _mm256_permute2x128_si256(x0, x1, 0x20); + int32x8 a1 = _mm256_permute2x128_si256(x0, x1, 0x31); + int32x8 a2 = _mm256_permute2x128_si256(x2, x3, 0x20); + int32x8 a3 = _mm256_permute2x128_si256(x2, x3, 0x31); + int32x8_MINMAX(a0, a1); + int32x8_MINMAX(a2, a3); + int32x8 b0 = _mm256_permute2x128_si256(a0, a1, 0x20); + int32x8 b1 = _mm256_permute2x128_si256(a0, a1, 0x31); + int32x8 b2 = _mm256_permute2x128_si256(a2, a3, 0x20); + int32x8 b3 = _mm256_permute2x128_si256(a2, a3, 0x31); + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); + int32x8 c2 = _mm256_unpacklo_epi64(b2, b3); + int32x8 c3 = _mm256_unpackhi_epi64(b2, b3); + int32x8_MINMAX(c0, c1); + int32x8_MINMAX(c2, c3); + int32x8 d0 = _mm256_unpacklo_epi32(c0, c1); + int32x8 d1 = _mm256_unpackhi_epi32(c0, c1); + int32x8 d2 = _mm256_unpacklo_epi32(c2, c3); + int32x8 d3 = _mm256_unpackhi_epi32(c2, c3); + int32x8 e0 = _mm256_unpacklo_epi64(d0, d1); + int32x8 e1 = _mm256_unpackhi_epi64(d0, d1); + int32x8 e2 = _mm256_unpacklo_epi64(d2, d3); + int32x8 e3 = _mm256_unpackhi_epi64(d2, d3); + int32x8_MINMAX(e0, e1); + int32x8_MINMAX(e2, e3); + int32x8 f0 = _mm256_unpacklo_epi32(e0, e1); + int32x8 f1 = _mm256_unpackhi_epi32(e0, e1); + int32x8 f2 = _mm256_unpacklo_epi32(e2, e3); + int32x8 f3 = _mm256_unpackhi_epi32(e2, e3); + int32x8_store(&x[j], f0); + int32x8_store(&x[j + 8], f1); + int32x8_store(&x[j + 16], f2); + int32x8_store(&x[j + 24], f3); + } + minmax_vector(x + j, x + j + 16, n - 16 - j); + goto continue8; + } + /* q == 8 */ + j = 0; +continue8: + for (; j + 16 <= n; j += 16) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 8]); + int32x8_MINMAX(x0, x1); + int32x8_store(&x[j], x0); + int32x8_store(&x[j + 8], x1); + int32x8 a0 = _mm256_permute2x128_si256(x0, x1, 0x20); /* x0123y0123 */ + int32x8 a1 = _mm256_permute2x128_si256(x0, x1, 0x31); /* x4567y4567 */ + int32x8_MINMAX(a0, a1); + int32x8 b0 = _mm256_permute2x128_si256(a0, a1, 0x20); /* x01234567 */ + int32x8 b1 = _mm256_permute2x128_si256(a0, a1, 0x31); /* y01234567 */ + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); /* x01y01x45y45 */ + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); /* x23y23x67y67 */ + int32x8_MINMAX(c0, c1); + int32x8 d0 = 
_mm256_unpacklo_epi32(c0, c1); /* x02x13x46x57 */ + int32x8 d1 = _mm256_unpackhi_epi32(c0, c1); /* y02y13y46y57 */ + int32x8 e0 = _mm256_unpacklo_epi64(d0, d1); /* x02y02x46y46 */ + int32x8 e1 = _mm256_unpackhi_epi64(d0, d1); /* x13y13x57y57 */ + int32x8_MINMAX(e0, e1); + int32x8 f0 = _mm256_unpacklo_epi32(e0, e1); /* x01234567 */ + int32x8 f1 = _mm256_unpackhi_epi32(e0, e1); /* y01234567 */ + int32x8_store(&x[j], f0); + int32x8_store(&x[j + 8], f1); + } + minmax_vector(x + j, x + j + 8, n - 8 - j); + if (j + 8 <= n) { + int32_MINMAX(&x[j], &x[j + 4]); + int32_MINMAX(&x[j + 1], &x[j + 5]); + int32_MINMAX(&x[j + 2], &x[j + 6]); + int32_MINMAX(&x[j + 3], &x[j + 7]); + int32_MINMAX(&x[j], &x[j + 2]); + int32_MINMAX(&x[j + 1], &x[j + 3]); + int32_MINMAX(&x[j], &x[j + 1]); + int32_MINMAX(&x[j + 2], &x[j + 3]); + int32_MINMAX(&x[j + 4], &x[j + 6]); + int32_MINMAX(&x[j + 5], &x[j + 7]); + int32_MINMAX(&x[j + 4], &x[j + 5]); + int32_MINMAX(&x[j + 6], &x[j + 7]); + j += 8; + } + minmax_vector(x + j, x + j + 4, n - 4 - j); + if (j + 4 <= n) { + int32_MINMAX(&x[j], &x[j + 2]); + int32_MINMAX(&x[j + 1], &x[j + 3]); + int32_MINMAX(&x[j], &x[j + 1]); + int32_MINMAX(&x[j + 2], &x[j + 3]); + j += 4; + } + if (j + 3 <= n) { + int32_MINMAX(&x[j], &x[j + 2]); + } + if (j + 2 <= n) { + int32_MINMAX(&x[j], &x[j + 1]); + } +} diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_avx/int32_sort.h b/src/kem/classic_mceliece/pqclean_mceliece348864_avx/int32_sort.h new file mode 100644 index 0000000000..36a5034d1f --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_avx/int32_sort.h @@ -0,0 +1,9 @@ +#ifndef PQCLEAN_MCELIECE348864_AVX_INT32_SORT_H +#define PQCLEAN_MCELIECE348864_AVX_INT32_SORT_H + +#include +#include + +void PQCLEAN_MCELIECE348864_AVX_int32_sort(int32_t *x, size_t n); + +#endif diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_avx/operations.c b/src/kem/classic_mceliece/pqclean_mceliece348864_avx/operations.c new file mode 100644 index 0000000000..cffd284939 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_avx/operations.c @@ -0,0 +1,136 @@ +#include "api.h" + +#include "aes256ctr.h" +#include "controlbits.h" +#include "crypto_hash.h" +#include "decrypt.h" +#include "encrypt.h" +#include "params.h" +#include "pk_gen.h" +#include "randombytes.h" +#include "sk_gen.h" +#include "util.h" + +#include +#include + +int PQCLEAN_MCELIECE348864_AVX_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const uint8_t *pk +) { + uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; + uint8_t *e = two_e + 1; + uint8_t one_ec[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ] = {1}; + + PQCLEAN_MCELIECE348864_AVX_encrypt(c, e, pk); + + crypto_hash_32b(c + SYND_BYTES, two_e, sizeof(two_e)); + + memcpy(one_ec + 1, e, SYS_N / 8); + memcpy(one_ec + 1 + SYS_N / 8, c, SYND_BYTES + 32); + + crypto_hash_32b(key, one_ec, sizeof(one_ec)); + + return 0; +} + +int PQCLEAN_MCELIECE348864_AVX_crypto_kem_dec( + uint8_t *key, + const uint8_t *c, + const uint8_t *sk +) { + int i; + + uint8_t ret_confirm = 0; + uint8_t ret_decrypt = 0; + + uint16_t m; + + uint8_t conf[32]; + uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; + uint8_t *e = two_e + 1; + uint8_t preimage[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ]; + uint8_t *x = preimage; + + // + + ret_decrypt = (uint8_t)PQCLEAN_MCELIECE348864_AVX_decrypt(e, sk + SYS_N / 8, c); + + crypto_hash_32b(conf, two_e, sizeof(two_e)); + + for (i = 0; i < 32; i++) { + ret_confirm |= conf[i] ^ c[SYND_BYTES + i]; + } + + m = ret_decrypt | ret_confirm; + m -= 1; + m >>= 8; + + *x++ = (~m & 0) | (m 
& 1); + for (i = 0; i < SYS_N / 8; i++) { + *x++ = (~m & sk[i]) | (m & e[i]); + } + for (i = 0; i < SYND_BYTES + 32; i++) { + *x++ = c[i]; + } + + crypto_hash_32b(key, preimage, sizeof(preimage)); + + return 0; +} + +int PQCLEAN_MCELIECE348864_AVX_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +) { + int i; + uint8_t seed[ 32 ]; + uint8_t r[ SYS_T * 2 + (1 << GFBITS)*sizeof(uint32_t) + SYS_N / 8 + 32 ]; + uint8_t nonce[ 16 ] = {0}; + uint8_t *rp; + + gf f[ SYS_T ]; // element in GF(2^mt) + gf irr[ SYS_T ]; // Goppa polynomial + uint32_t perm[ 1 << GFBITS ]; // random permutation + + randombytes(seed, sizeof(seed)); + + while (1) { + rp = r; + PQCLEAN_MCELIECE348864_AVX_aes256ctr(r, sizeof(r), nonce, seed); + memcpy(seed, &r[ sizeof(r) - 32 ], 32); + + for (i = 0; i < SYS_T; i++) { + f[i] = PQCLEAN_MCELIECE348864_AVX_load2(rp + i * 2); + } + rp += sizeof(f); + if (PQCLEAN_MCELIECE348864_AVX_genpoly_gen(irr, f)) { + continue; + } + + for (i = 0; i < (1 << GFBITS); i++) { + perm[i] = PQCLEAN_MCELIECE348864_AVX_load4(rp + i * 4); + } + rp += sizeof(perm); + if (PQCLEAN_MCELIECE348864_AVX_perm_check(perm)) { + continue; + } + + for (i = 0; i < SYS_T; i++) { + PQCLEAN_MCELIECE348864_AVX_store2(sk + SYS_N / 8 + i * 2, irr[i]); + } + if (PQCLEAN_MCELIECE348864_AVX_pk_gen(pk, perm, sk + SYS_N / 8)) { + continue; + } + + memcpy(sk, rp, SYS_N / 8); + PQCLEAN_MCELIECE348864_AVX_controlbits(sk + SYS_N / 8 + IRR_BYTES, perm); + + break; + } + + return 0; +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_avx/params.h b/src/kem/classic_mceliece/pqclean_mceliece348864_avx/params.h new file mode 100644 index 0000000000..f45ee11e2e --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_avx/params.h @@ -0,0 +1,21 @@ +#ifndef PQCLEAN_MCELIECE348864_AVX_PARAMS_H +#define PQCLEAN_MCELIECE348864_AVX_PARAMS_H + +#define GFBITS 12 +#define SYS_N 3488 +#define SYS_T 64 + +#define COND_BYTES ((1 << (GFBITS-4))*(2*GFBITS - 1)) +#define IRR_BYTES (SYS_T * 2) + +#define PK_NROWS (SYS_T*GFBITS) +#define PK_NCOLS (SYS_N - PK_NROWS) +#define PK_ROW_BYTES ((PK_NCOLS + 7)/8) + +#define SK_BYTES (SYS_N/8 + IRR_BYTES + COND_BYTES) +#define SYND_BYTES ((PK_NROWS + 7)/8) + +#define GFMASK ((1 << GFBITS) - 1) + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_avx/pk_gen.c b/src/kem/classic_mceliece/pqclean_mceliece348864_avx/pk_gen.c new file mode 100644 index 0000000000..81d46f695a --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_avx/pk_gen.c @@ -0,0 +1,276 @@ +/* + This file is for public-key generation +*/ + +#include "pk_gen.h" + +#include "benes.h" +#include "controlbits.h" +#include "fft.h" +#include "params.h" +#include "util.h" + +#include + +static void de_bitslicing(uint64_t *out, vec256 in[][GFBITS]) { + int i, j, r; + uint64_t u = 0; + + for (i = 0; i < (1 << GFBITS); i++) { + out[i] = 0 ; + } + + for (i = 0; i < 16; i++) { + for (j = GFBITS - 1; j >= 0; j--) { + u = PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[i][j], 0); + for (r = 0; r < 64; r++) { + out[i * 256 + 0 * 64 + r] <<= 1; + out[i * 256 + 0 * 64 + r] |= (u >> r) & 1; + } + u = PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[i][j], 1); + for (r = 0; r < 64; r++) { + out[i * 256 + 1 * 64 + r] <<= 1; + out[i * 256 + 1 * 64 + r] |= (u >> r) & 1; + } + u = PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[i][j], 2); + for (r = 0; r < 64; r++) { + out[i * 256 + 2 * 64 + r] <<= 1; + out[i * 256 + 2 * 64 + r] |= (u >> r) & 1; + } + u = PQCLEAN_MCELIECE348864_AVX_vec256_extract(in[i][j], 3); + for (r 
= 0; r < 64; r++) { + out[i * 256 + 3 * 64 + r] <<= 1; + out[i * 256 + 3 * 64 + r] |= (u >> r) & 1; + } + } + } +} + +static void to_bitslicing_2x(vec256 out0[][GFBITS], vec256 out1[][GFBITS], const uint64_t *in) { + int i, j, k, r; + uint64_t u[4] = {0}; + + for (i = 0; i < 16; i++) { + for (j = GFBITS - 1; j >= 0; j--) { + for (k = 0; k < 4; k++) { + for (r = 63; r >= 0; r--) { + u[k] <<= 1; + u[k] |= (in[i * 256 + k * 64 + r] >> (j + GFBITS)) & 1; + } + } + + out1[i][j] = PQCLEAN_MCELIECE348864_AVX_vec256_set4x(u[0], u[1], u[2], u[3]); + } + + for (j = GFBITS - 1; j >= 0; j--) { + for (k = 0; k < 4; k++) { + for (r = 63; r >= 0; r--) { + u[k] <<= 1; + u[k] |= (in[i * 256 + k * 64 + r] >> j) & 1; + } + } + + out0[i][GFBITS - 1 - j] = PQCLEAN_MCELIECE348864_AVX_vec256_set4x(u[0], u[1], u[2], u[3]); + } + } +} + +#define NBLOCKS1_H ((SYS_N + 63) / 64) +#define NBLOCKS2_H ((SYS_N + 255) / 256) +#define NBLOCKS1_I ((GFBITS * SYS_T + 63) / 64) +#define NBLOCKS2_I ((GFBITS * SYS_T + 255) / 256) +int PQCLEAN_MCELIECE348864_AVX_pk_gen(unsigned char *pk, uint32_t *perm, const unsigned char *sk) { + const int block_idx = NBLOCKS1_I; + + int i, j, k; + int row, c; + + uint64_t mat[ GFBITS * SYS_T ][ NBLOCKS2_H * 4 ]; + uint64_t ops[ GFBITS * SYS_T ][ NBLOCKS1_I ]; + + uint64_t mask; + + uint64_t sk_int[ GFBITS ]; + + vec256 consts[ 16 ][ GFBITS ]; + vec256 eval[ 16 ][ GFBITS ]; + vec256 prod[ 16 ][ GFBITS ]; + vec256 tmp[ GFBITS ]; + + uint64_t list[1 << GFBITS]; + uint64_t one_row[ 128 ]; + + // compute the inverses + + PQCLEAN_MCELIECE348864_AVX_irr_load(sk_int, sk); + + PQCLEAN_MCELIECE348864_AVX_fft(eval, sk_int); + + PQCLEAN_MCELIECE348864_AVX_vec256_copy(prod[0], eval[0]); + + for (i = 1; i < 16; i++) { + PQCLEAN_MCELIECE348864_AVX_vec256_mul(prod[i], prod[i - 1], eval[i]); + } + + PQCLEAN_MCELIECE348864_AVX_vec256_inv(tmp, prod[15]); + + for (i = 14; i >= 0; i--) { + PQCLEAN_MCELIECE348864_AVX_vec256_mul(prod[i + 1], prod[i], tmp); + PQCLEAN_MCELIECE348864_AVX_vec256_mul(tmp, tmp, eval[i + 1]); + } + + PQCLEAN_MCELIECE348864_AVX_vec256_copy(prod[0], tmp); + + // fill matrix + + de_bitslicing(list, prod); + + for (i = 0; i < (1 << GFBITS); i++) { + list[i] <<= GFBITS; + list[i] |= i; + list[i] |= ((uint64_t) perm[i]) << 31; + } + + PQCLEAN_MCELIECE348864_AVX_sort_63b(1 << GFBITS, list); + + to_bitslicing_2x(consts, prod, list); + + for (i = 0; i < (1 << GFBITS); i++) { + perm[i] = list[i] & GFMASK; + } + + for (j = 0; j < NBLOCKS2_I; j++) { + for (k = 0; k < GFBITS; k++) { + mat[ k ][ 4 * j + 0 ] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(prod[ j ][ k ], 0); + mat[ k ][ 4 * j + 1 ] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(prod[ j ][ k ], 1); + mat[ k ][ 4 * j + 2 ] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(prod[ j ][ k ], 2); + mat[ k ][ 4 * j + 3 ] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(prod[ j ][ k ], 3); + } + } + + for (i = 1; i < SYS_T; i++) { + for (j = 0; j < NBLOCKS2_I; j++) { + PQCLEAN_MCELIECE348864_AVX_vec256_mul(prod[j], prod[j], consts[j]); + + for (k = 0; k < GFBITS; k++) { + mat[ i * GFBITS + k ][ 4 * j + 0 ] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(prod[ j ][ k ], 0); + mat[ i * GFBITS + k ][ 4 * j + 1 ] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(prod[ j ][ k ], 1); + mat[ i * GFBITS + k ][ 4 * j + 2 ] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(prod[ j ][ k ], 2); + mat[ i * GFBITS + k ][ 4 * j + 3 ] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(prod[ j ][ k ], 3); + } + } + } + + // gaussian elimination to obtain an upper triangular matrix + // and keep track of 
the operations in ops + + for (i = 0; i < PK_NROWS; i++) { + for (j = 0; j < NBLOCKS1_I; j++) { + ops[ i ][ j ] = 0; + } + } + + for (i = 0; i < PK_NROWS; i++) { + ops[ i ][ i / 64 ] = 1; + ops[ i ][ i / 64 ] <<= (i % 64); + } + + for (row = 0; row < PK_NROWS; row++) { + i = row >> 6; + j = row & 63; + + for (k = row + 1; k < PK_NROWS; k++) { + mask = mat[ row ][ i ] >> j; + mask &= 1; + mask -= 1; + + for (c = 0; c < NBLOCKS1_I; c++) { + mat[ row ][ c ] ^= mat[ k ][ c ] & mask; + ops[ row ][ c ] ^= ops[ k ][ c ] & mask; + } + } + + if ( ((mat[ row ][ i ] >> j) & 1) == 0 ) { // return if not systematic + return -1; + } + + for (k = row + 1; k < PK_NROWS; k++) { + mask = mat[ k ][ i ] >> j; + mask &= 1; + mask = -mask; + + for (c = 0; c < NBLOCKS1_I; c++) { + mat[ k ][ c ] ^= mat[ row ][ c ] & mask; + ops[ k ][ c ] ^= ops[ row ][ c ] & mask; + } + } + } + + // computing the lineaer map required to obatin the systematic form + + for (row = PK_NROWS - 1; row >= 0; row--) { + for (k = 0; k < row; k++) { + mask = mat[ k ][ row / 64 ] >> (row & 63); + mask &= 1; + mask = -mask; + + for (c = 0; c < NBLOCKS1_I; c++) { + ops[ k ][ c ] ^= ops[ row ][ c ] & mask; + } + } + } + + // apply the linear map to the non-systematic part + + for (j = NBLOCKS2_I; j < NBLOCKS2_H; j++) { + for (k = 0; k < GFBITS; k++) { + mat[ k ][ 4 * j + 0 ] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(prod[ j ][ k ], 0); + mat[ k ][ 4 * j + 1 ] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(prod[ j ][ k ], 1); + mat[ k ][ 4 * j + 2 ] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(prod[ j ][ k ], 2); + mat[ k ][ 4 * j + 3 ] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(prod[ j ][ k ], 3); + } + } + + for (i = 1; i < SYS_T; i++) { + for (j = NBLOCKS2_I; j < NBLOCKS2_H; j++) { + PQCLEAN_MCELIECE348864_AVX_vec256_mul(prod[j], prod[j], consts[j]); + + for (k = 0; k < GFBITS; k++) { + mat[ i * GFBITS + k ][ 4 * j + 0 ] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(prod[ j ][ k ], 0); + mat[ i * GFBITS + k ][ 4 * j + 1 ] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(prod[ j ][ k ], 1); + mat[ i * GFBITS + k ][ 4 * j + 2 ] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(prod[ j ][ k ], 2); + mat[ i * GFBITS + k ][ 4 * j + 3 ] = PQCLEAN_MCELIECE348864_AVX_vec256_extract(prod[ j ][ k ], 3); + } + } + } + + for (row = 0; row < PK_NROWS; row++) { + for (k = 0; k < NBLOCKS1_H; k++) { + one_row[ k ] = 0; + } + + for (c = 0; c < PK_NROWS; c++) { + mask = ops[ row ][ c >> 6 ] >> (c & 63); + mask &= 1; + mask = -mask; + + for (k = block_idx; k < NBLOCKS1_H; k++) { + one_row[ k ] ^= mat[ c ][ k ] & mask; + } + } + + for (k = block_idx; k < NBLOCKS1_H - 1; k++) { + PQCLEAN_MCELIECE348864_AVX_store8(pk, one_row[k]); + pk += 8; + } + + PQCLEAN_MCELIECE348864_AVX_store_i(pk, one_row[k], PK_ROW_BYTES % 8); + + pk += PK_ROW_BYTES % 8; + } + + // + + return 0; +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_avx/pk_gen.h b/src/kem/classic_mceliece/pqclean_mceliece348864_avx/pk_gen.h new file mode 100644 index 0000000000..e3a61a0b8d --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_avx/pk_gen.h @@ -0,0 +1,13 @@ +#ifndef PQCLEAN_MCELIECE348864_AVX_PK_GEN_H +#define PQCLEAN_MCELIECE348864_AVX_PK_GEN_H +/* + This file is for public-key generation +*/ + + +#include "gf.h" + +int PQCLEAN_MCELIECE348864_AVX_pk_gen(unsigned char * /*pk*/, uint32_t * /*perm*/, const unsigned char * /*sk*/); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_avx/powers.inc b/src/kem/classic_mceliece/pqclean_mceliece348864_avx/powers.inc 
new file mode 100644 index 0000000000..03e8349b8d --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_avx/powers.inc @@ -0,0 +1,224 @@ +{ + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 
0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x5555555555555555, 0x5555555555555555, 0x5555555555555555, 0x5555555555555555), +}, +{ + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x5555555555555555, 0x5555555555555555, 0x5555555555555555, 0x5555555555555555), +}, +{ + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0x0000000000000000), + 
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x5555555555555555, 0x5555555555555555, 0x5555555555555555, 0x5555555555555555), +}, +{ + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F, 
0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x5555555555555555, 0x5555555555555555, 0x5555555555555555, 0x5555555555555555), +}, +{ + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + 
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x5555555555555555, 0x5555555555555555, 0x5555555555555555, 0x5555555555555555), +}, +{ + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 
0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x5555555555555555, 0x5555555555555555, 0x5555555555555555, 0x5555555555555555), +}, +{ + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), + 
PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x5555555555555555, 0x5555555555555555, 0x5555555555555555, 0x5555555555555555), +}, +{ + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333), + PQCLEAN_MCELIECE348864_AVX_vec256_set4x(0x5555555555555555, 0x5555555555555555, 0x5555555555555555, 0x5555555555555555), +}, diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_avx/scalars.inc b/src/kem/classic_mceliece/pqclean_mceliece348864_avx/scalars.inc new file mode 100644 index 0000000000..aa8f64b951 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_avx/scalars.inc @@ -0,0 +1,70 @@ +{ + 0XF3CFC030FC30F003, + 0X3FCF0F003C00C00C, + 0X30033CC300C0C03C, + 0XCCFF0F3C0F30F0C0, + 0X0300C03FF303C3F0, + 0X3FFF3C0FF0CCCCC0, + 0XF3FFF0C00F3C3CC0, + 0X3003333FFFC3C000, + 0X0FF30FFFC3FFF300, + 0XFFC0F300F0F0CC00, + 0XC0CFF3FCCC3CFC00, + 0XFC3C03F0F330C000, +}, +{ + 0X000F00000000F00F, + 0X00000F00F00000F0, + 0X0F00000F00000F00, + 0XF00F00F00F000000, + 0X00F00000000000F0, + 0X0000000F00000000, + 0XF00000000F00F000, + 0X00F00F00000F0000, + 0X0000F00000F00F00, + 0X000F00F00F00F000, + 0X00F00F0000000000, + 0X0000000000F00000, +}, +{ 
+ 0X0000FF00FF0000FF, + 0X0000FF000000FF00, + 0XFF0000FF00FF0000, + 0XFFFF0000FF000000, + 0X00FF00FF00FF0000, + 0X0000FFFFFF000000, + 0X00FFFF00FF000000, + 0XFFFFFF0000FF0000, + 0XFFFF00FFFF00FF00, + 0X0000FF0000000000, + 0XFFFFFF00FF000000, + 0X00FF000000000000, +}, +{ + 0X000000000000FFFF, + 0X00000000FFFF0000, + 0X0000000000000000, + 0XFFFF000000000000, + 0X00000000FFFF0000, + 0X0000FFFF00000000, + 0X0000000000000000, + 0X00000000FFFF0000, + 0X0000FFFF00000000, + 0X0000000000000000, + 0X0000000000000000, + 0X0000000000000000, +}, +{ + 0X00000000FFFFFFFF, + 0XFFFFFFFF00000000, + 0XFFFFFFFF00000000, + 0X0000000000000000, + 0X0000000000000000, + 0XFFFFFFFF00000000, + 0X0000000000000000, + 0X0000000000000000, + 0XFFFFFFFF00000000, + 0X0000000000000000, + 0X0000000000000000, + 0X0000000000000000, +} diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_avx/scalars_2x.inc b/src/kem/classic_mceliece/pqclean_mceliece348864_avx/scalars_2x.inc new file mode 100644 index 0000000000..604ec6b063 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_avx/scalars_2x.inc @@ -0,0 +1,70 @@ +{ + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0xf3cfc030fc30f003, 0x000c03c0c3c0330c), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x3fcf0f003c00c00c, 0xf330cffcc00f33c0), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x30033cc300c0c03c, 0xccf330f00f3c0333), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0xccff0f3c0f30f0c0, 0xff03fff3ff0cf0c0), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0300c03ff303c3f0, 0x3cc3fcf00fcc303c), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x3fff3c0ff0ccccc0, 0x0f000c0fc30303f3), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0xf3fff0c00f3c3cc0, 0xcf0fc3ff333ccf3c), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x3003333fffc3c000, 0x003f3fc3c0ff333f), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0ff30fffc3fff300, 0x3cc3f0f3cf0ff00f), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0xffc0f300f0f0cc00, 0xf3f33cc03fc30cc0), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0xc0cff3fccc3cfc00, 0x3cc330cfc333f33f), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0xfc3c03f0f330c000, 0x3cc0303ff3c3fffc), +}, +{ + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x000f00000000f00f, 0x0f00f00f00000000), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x00000f00f00000f0, 0xf00000000000f000), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0f00000f00000f00, 0x00000f00000000f0), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0xf00f00f00f000000, 0x0f00f00000f00000), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x00f00000000000f0, 0x000f00000f00f00f), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0000000f00000000, 0x00f00f00f00f0000), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0xf00000000f00f000, 0x0f00f00000000000), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x00f00f00000f0000, 0x000000000f000000), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0000f00000f00f00, 0x00f00000000f00f0), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x000f00f00f00f000, 0x0000f00f00000f00), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x00f00f0000000000, 0xf00000f00000f00f), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0000000000f00000, 0x00000f00f00f00f0), +}, +{ + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0000ff00ff0000ff, 0xff00ffffff000000), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0000ff000000ff00, 0xff0000ffff000000), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0xff0000ff00ff0000, 0xffff00ffff000000), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0xffff0000ff000000, 0xff00ffffffffff00), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x00ff00ff00ff0000, 0x00000000ff00ff00), + 
PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0000ffffff000000, 0xffffffff00ff0000), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x00ffff00ff000000, 0x00ffffff00ff0000), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0xffffff0000ff0000, 0xffff00ffff00ffff), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0xffff00ffff00ff00, 0xffff0000ffffffff), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0000ff0000000000, 0xff00000000ff0000), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0xffffff00ff000000, 0x000000ff00ff00ff), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x00ff000000000000, 0x00ff00ff00ffff00), +}, +{ + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x000000000000ffff, 0x0000000000000000), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x00000000ffff0000, 0xffff000000000000), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0xffff000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x00000000ffff0000, 0xffff00000000ffff), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0000ffff00000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0000000000000000, 0x0000ffff00000000), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x00000000ffff0000, 0xffff00000000ffff), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0000ffff00000000, 0x00000000ffff0000), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0000000000000000, 0xffff00000000ffff), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0000000000000000, 0x00000000ffff0000), +}, +{ + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x00000000ffffffff, 0x0000000000000000), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0xffffffff00000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0xffffffff00000000, 0x00000000ffffffff), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0000000000000000, 0xffffffff00000000), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0000000000000000, 0xffffffff00000000), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0xffffffff00000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0000000000000000, 0xffffffff00000000), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0000000000000000, 0xffffffffffffffff), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0xffffffff00000000, 0xffffffff00000000), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0000000000000000, 0xffffffffffffffff), + PQCLEAN_MCELIECE348864_AVX_vec128_set2x(0x0000000000000000, 0xffffffff00000000), +}, diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_avx/sk_gen.c b/src/kem/classic_mceliece/pqclean_mceliece348864_avx/sk_gen.c new file mode 100644 index 0000000000..e30ed8a237 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_avx/sk_gen.c @@ -0,0 +1,98 @@ +/* + This file is for secret-key generation +*/ + +#include "sk_gen.h" + +#include "controlbits.h" +#include "gf.h" +#include "params.h" +#include "util.h" + +/* input: f, element in GF((2^m)^t) */ +/* output: out, minimal polynomial of f */ +/* return: 0 for success and -1 for failure */ +int PQCLEAN_MCELIECE348864_AVX_genpoly_gen(gf *out, gf *f) { + int i, j, k, c; + + gf mat[ SYS_T + 1 ][ SYS_T ]; + gf mask, inv, t; + + // fill matrix + + mat[0][0] = 1; + + for (i = 1; i < SYS_T; i++) { + mat[0][i] = 0; + } + + for (i = 0; i < SYS_T; i++) { + mat[1][i] = f[i]; + } + + for (j = 2; j <= SYS_T; j++) { + PQCLEAN_MCELIECE348864_AVX_GF_mul(mat[j], mat[j - 1], f); + } + + // 
gaussian + + for (j = 0; j < SYS_T; j++) { + for (k = j + 1; k < SYS_T; k++) { + mask = PQCLEAN_MCELIECE348864_AVX_gf_iszero(mat[ j ][ j ]); + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ j ] ^= mat[ c ][ k ] & mask; + } + + } + + if ( mat[ j ][ j ] == 0 ) { // return if not systematic + return -1; + } + + inv = PQCLEAN_MCELIECE348864_AVX_gf_inv(mat[j][j]); + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ j ] = PQCLEAN_MCELIECE348864_AVX_gf_mul(mat[ c ][ j ], inv) ; + } + + for (k = 0; k < SYS_T; k++) { + if (k != j) { + t = mat[ j ][ k ]; + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ k ] ^= PQCLEAN_MCELIECE348864_AVX_gf_mul(mat[ c ][ j ], t); + } + } + } + } + + for (i = 0; i < SYS_T; i++) { + out[i] = mat[ SYS_T ][ i ]; + } + + return 0; +} + +/* input: permutation p represented as a list of 32-bit intergers */ +/* output: -1 if some interger repeats in p */ +/* 0 otherwise */ +int PQCLEAN_MCELIECE348864_AVX_perm_check(const uint32_t *p) { + int i; + uint64_t list[1 << GFBITS]; + + for (i = 0; i < (1 << GFBITS); i++) { + list[i] = p[i]; + } + + PQCLEAN_MCELIECE348864_AVX_sort_63b(1 << GFBITS, list); + + for (i = 1; i < (1 << GFBITS); i++) { + if (list[i - 1] == list[i]) { + return -1; + } + } + + return 0; +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_avx/sk_gen.h b/src/kem/classic_mceliece/pqclean_mceliece348864_avx/sk_gen.h new file mode 100644 index 0000000000..5baa544d5c --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_avx/sk_gen.h @@ -0,0 +1,16 @@ +#ifndef PQCLEAN_MCELIECE348864_AVX_SK_GEN_H +#define PQCLEAN_MCELIECE348864_AVX_SK_GEN_H +/* + This file is for secret-key generation +*/ + + +#include "gf.h" + +#include + +int PQCLEAN_MCELIECE348864_AVX_genpoly_gen(gf * /*out*/, gf * /*f*/); +int PQCLEAN_MCELIECE348864_AVX_perm_check(const uint32_t * /*p*/); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_avx/syndrome_asm.S b/src/kem/classic_mceliece/pqclean_mceliece348864_avx/syndrome_asm.S new file mode 100644 index 0000000000..c77b629cb7 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_avx/syndrome_asm.S @@ -0,0 +1,530 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: int64 b64 + +# qhasm: int64 synd + +# qhasm: int64 addr + +# qhasm: int64 c + +# qhasm: int64 c_all + +# qhasm: int64 row + +# qhasm: int64 p + +# qhasm: int64 e + +# qhasm: int64 s + +# qhasm: reg256 pp + +# qhasm: reg256 ee + +# qhasm: reg256 ss + +# qhasm: int64 buf_ptr + +# qhasm: stack256 buf + +# qhasm: enter syndrome_asm +.p2align 5 +.global _PQCLEAN_MCELIECE348864_AVX_syndrome_asm +.global PQCLEAN_MCELIECE348864_AVX_syndrome_asm +_PQCLEAN_MCELIECE348864_AVX_syndrome_asm: +PQCLEAN_MCELIECE348864_AVX_syndrome_asm: +mov %rsp,%r11 +and $31,%r11 +add $32,%r11 +sub %r11,%rsp + +# qhasm: input_1 += 260780 +# asm 1: add $260780,buf_ptr=int64#4 +# asm 2: leaq buf_ptr=%rcx +leaq 0(%rsp),%rcx + +# qhasm: row = 768 +# asm 1: mov $768,>row=int64#5 +# asm 2: mov $768,>row=%r8 +mov $768,%r8 + +# qhasm: loop: +._loop: + +# qhasm: row -= 1 +# asm 1: sub $1,ss=reg256#1 +# asm 2: vmovupd 0(ss=%ymm0 +vmovupd 0(%rsi),%ymm0 + +# qhasm: ee = mem256[ input_2 + 96 ] +# asm 
1: vmovupd 96(ee=reg256#2 +# asm 2: vmovupd 96(ee=%ymm1 +vmovupd 96(%rdx),%ymm1 + +# qhasm: ss &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 32(pp=%ymm1 +vmovupd 32(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 128 ] +# asm 1: vmovupd 128(ee=reg256#3 +# asm 2: vmovupd 128(ee=%ymm2 +vmovupd 128(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 64(pp=%ymm1 +vmovupd 64(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 160 ] +# asm 1: vmovupd 160(ee=reg256#3 +# asm 2: vmovupd 160(ee=%ymm2 +vmovupd 160(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 96(pp=%ymm1 +vmovupd 96(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 192 ] +# asm 1: vmovupd 192(ee=reg256#3 +# asm 2: vmovupd 192(ee=%ymm2 +vmovupd 192(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 128(pp=%ymm1 +vmovupd 128(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 224 ] +# asm 1: vmovupd 224(ee=reg256#3 +# asm 2: vmovupd 224(ee=%ymm2 +vmovupd 224(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 160(pp=%ymm1 +vmovupd 160(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 256 ] +# asm 1: vmovupd 256(ee=reg256#3 +# asm 2: vmovupd 256(ee=%ymm2 +vmovupd 256(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 192(pp=%ymm1 +vmovupd 192(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 288 ] +# asm 1: vmovupd 288(ee=reg256#3 +# asm 2: vmovupd 288(ee=%ymm2 +vmovupd 288(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 224(pp=%ymm1 +vmovupd 224(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 320 ] +# asm 1: vmovupd 320(ee=reg256#3 +# asm 2: vmovupd 320(ee=%ymm2 +vmovupd 320(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 256(pp=%ymm1 +vmovupd 256(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 352 ] +# asm 1: vmovupd 352(ee=reg256#3 +# asm 2: vmovupd 352(ee=%ymm2 +vmovupd 352(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 288(pp=%ymm1 +vmovupd 288(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 384 ] +# asm 1: vmovupd 384(ee=reg256#3 +# asm 2: vmovupd 384(ee=%ymm2 +vmovupd 384(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand buf=stack256#1 +# asm 2: vmovapd buf=0(%rsp) +vmovapd %ymm0,0(%rsp) + +# qhasm: s = *(uint64 *)(input_1 + 320) +# asm 1: movq 320(s=int64#6 +# asm 2: movq 320(s=%r9 +movq 320(%rsi),%r9 + +# qhasm: e = *(uint64 *)(input_2 + 416) +# asm 1: movq 416(e=int64#7 +# asm 2: movq 416(e=%rax +movq 416(%rdx),%rax + +# qhasm: s &= e +# asm 1: and p=int64#7 +# asm 2: movq 328(p=%rax +movq 328(%rsi),%rax + +# qhasm: e = *(uint64 *)(input_2 + 424) +# asm 1: movq 424(e=int64#8 +# asm 2: movq 424(e=%r10 +movq 424(%rdx),%r10 + +# qhasm: p &= e +# asm 1: and p=int64#7d +# asm 2: movl 336(p=%eax +movl 336(%rsi),%eax + +# qhasm: e = *(uint32 *)(input_2 + 432) +# asm 1: movl 432(e=int64#8d +# asm 2: movl 432(e=%r10d +movl 432(%rdx),%r10d + +# qhasm: p &= e +# asm 1: and c_all=int64#6 +# asm 2: popcnt c_all=%r9 +popcnt %r9, %r9 + +# qhasm: b64 = mem64[ buf_ptr + 0 ] +# asm 1: movq 0(b64=int64#7 +# asm 2: movq 0(b64=%rax +movq 0(%rcx),%rax + +# qhasm: c = count(b64) +# asm 1: popcnt c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# qhasm: c_all ^= c +# asm 1: xor b64=int64#7 +# asm 2: movq 8(b64=%rax +movq 8(%rcx),%rax + +# qhasm: c = count(b64) +# asm 1: popcnt c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# qhasm: c_all ^= c +# asm 1: xor b64=int64#7 +# asm 2: movq 16(b64=%rax +movq 16(%rcx),%rax + +# 
qhasm: c = count(b64) +# asm 1: popcnt c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# qhasm: c_all ^= c +# asm 1: xor b64=int64#7 +# asm 2: movq 24(b64=%rax +movq 24(%rcx),%rax + +# qhasm: c = count(b64) +# asm 1: popcnt c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# qhasm: c_all ^= c +# asm 1: xor addr=int64#7 +# asm 2: mov addr=%rax +mov %r8,%rax + +# qhasm: (uint64) addr >>= 3 +# asm 1: shr $3,synd=int64#8 +# asm 2: movzbq 0(synd=%r10 +movzbq 0(%rax),%r10 + +# qhasm: synd <<= 1 +# asm 1: shl $1,ss=reg256#1 +# asm 2: vmovupd 0(ss=%ymm0 +vmovupd 0(%rdi),%ymm0 + +# qhasm: ee = mem256[ input_2 + 0 ] +# asm 1: vmovupd 0(ee=reg256#2 +# asm 2: vmovupd 0(ee=%ymm1 +vmovupd 0(%rdx),%ymm1 + +# qhasm: ss ^= ee +# asm 1: vpxor ss=reg256#1 +# asm 2: vmovupd 32(ss=%ymm0 +vmovupd 32(%rdi),%ymm0 + +# qhasm: ee = mem256[ input_2 + 32 ] +# asm 1: vmovupd 32(ee=reg256#2 +# asm 2: vmovupd 32(ee=%ymm1 +vmovupd 32(%rdx),%ymm1 + +# qhasm: ss ^= ee +# asm 1: vpxor ss=reg256#1 +# asm 2: vmovupd 64(ss=%ymm0 +vmovupd 64(%rdi),%ymm0 + +# qhasm: ee = mem256[ input_2 + 64 ] +# asm 1: vmovupd 64(ee=reg256#2 +# asm 2: vmovupd 64(ee=%ymm1 +vmovupd 64(%rdx),%ymm1 + +# qhasm: ss ^= ee +# asm 1: vpxor + + +void PQCLEAN_MCELIECE348864_AVX_transpose_64x64(uint64_t *in); +void PQCLEAN_MCELIECE348864_AVX_transpose_64x256_sp(vec256 *in); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_avx/transpose_64x256_sp_asm.S b/src/kem/classic_mceliece/pqclean_mceliece348864_avx/transpose_64x256_sp_asm.S new file mode 100644 index 0000000000..1e2dabd7bb --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_avx/transpose_64x256_sp_asm.S @@ -0,0 +1,8145 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: reg256 x0 + +# qhasm: reg256 x1 + +# qhasm: reg256 x2 + +# qhasm: reg256 x3 + +# qhasm: reg256 x4 + +# qhasm: reg256 x5 + +# qhasm: reg256 x6 + +# qhasm: reg256 x7 + +# qhasm: reg256 t0 + +# qhasm: reg256 t1 + +# qhasm: reg256 v00 + +# qhasm: reg256 v01 + +# qhasm: reg256 v10 + +# qhasm: reg256 v11 + +# qhasm: reg256 mask0 + +# qhasm: reg256 mask1 + +# qhasm: reg256 mask2 + +# qhasm: reg256 mask3 + +# qhasm: reg256 mask4 + +# qhasm: reg256 mask5 + +# qhasm: enter transpose_64x256_sp_asm +.p2align 5 +.global _PQCLEAN_MCELIECE348864_AVX_transpose_64x256_sp_asm +.global PQCLEAN_MCELIECE348864_AVX_transpose_64x256_sp_asm +_PQCLEAN_MCELIECE348864_AVX_transpose_64x256_sp_asm: +PQCLEAN_MCELIECE348864_AVX_transpose_64x256_sp_asm: +mov %rsp,%r11 +and $31,%r11 +add $0,%r11 +sub %r11,%rsp + +# qhasm: mask0 aligned= mem256[ PQCLEAN_MCELIECE348864_AVX_MASK5_0 ] +# asm 1: vmovapd PQCLEAN_MCELIECE348864_AVX_MASK5_0,>mask0=reg256#1 +# asm 2: vmovapd PQCLEAN_MCELIECE348864_AVX_MASK5_0,>mask0=%ymm0 +vmovapd PQCLEAN_MCELIECE348864_AVX_MASK5_0(%rip),%ymm0 + +# qhasm: mask1 aligned= mem256[ PQCLEAN_MCELIECE348864_AVX_MASK5_1 ] +# asm 1: vmovapd PQCLEAN_MCELIECE348864_AVX_MASK5_1,>mask1=reg256#2 +# asm 2: vmovapd PQCLEAN_MCELIECE348864_AVX_MASK5_1,>mask1=%ymm1 +vmovapd PQCLEAN_MCELIECE348864_AVX_MASK5_1(%rip),%ymm1 + +# qhasm: mask2 aligned= mem256[ PQCLEAN_MCELIECE348864_AVX_MASK4_0 ] +# asm 1: vmovapd 
PQCLEAN_MCELIECE348864_AVX_MASK4_0,>mask2=reg256#3 +# asm 2: vmovapd PQCLEAN_MCELIECE348864_AVX_MASK4_0,>mask2=%ymm2 +vmovapd PQCLEAN_MCELIECE348864_AVX_MASK4_0(%rip),%ymm2 + +# qhasm: mask3 aligned= mem256[ PQCLEAN_MCELIECE348864_AVX_MASK4_1 ] +# asm 1: vmovapd PQCLEAN_MCELIECE348864_AVX_MASK4_1,>mask3=reg256#4 +# asm 2: vmovapd PQCLEAN_MCELIECE348864_AVX_MASK4_1,>mask3=%ymm3 +vmovapd PQCLEAN_MCELIECE348864_AVX_MASK4_1(%rip),%ymm3 + +# qhasm: mask4 aligned= mem256[ PQCLEAN_MCELIECE348864_AVX_MASK3_0 ] +# asm 1: vmovapd PQCLEAN_MCELIECE348864_AVX_MASK3_0,>mask4=reg256#5 +# asm 2: vmovapd PQCLEAN_MCELIECE348864_AVX_MASK3_0,>mask4=%ymm4 +vmovapd PQCLEAN_MCELIECE348864_AVX_MASK3_0(%rip),%ymm4 + +# qhasm: mask5 aligned= mem256[ PQCLEAN_MCELIECE348864_AVX_MASK3_1 ] +# asm 1: vmovapd PQCLEAN_MCELIECE348864_AVX_MASK3_1,>mask5=reg256#6 +# asm 2: vmovapd PQCLEAN_MCELIECE348864_AVX_MASK3_1,>mask5=%ymm5 +vmovapd PQCLEAN_MCELIECE348864_AVX_MASK3_1(%rip),%ymm5 + +# qhasm: x0 = mem256[ input_0 + 0 ] +# asm 1: vmovupd 0(x0=reg256#7 +# asm 2: vmovupd 0(x0=%ymm6 +vmovupd 0(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 256 ] +# asm 1: vmovupd 256(x1=reg256#8 +# asm 2: vmovupd 256(x1=%ymm7 +vmovupd 256(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 512 ] +# asm 1: vmovupd 512(x2=reg256#9 +# asm 2: vmovupd 512(x2=%ymm8 +vmovupd 512(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 768 ] +# asm 1: vmovupd 768(x3=reg256#10 +# asm 2: vmovupd 768(x3=%ymm9 +vmovupd 768(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1024 ] +# asm 1: vmovupd 1024(x4=reg256#11 +# asm 2: vmovupd 1024(x4=%ymm10 +vmovupd 1024(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1280 ] +# asm 1: vmovupd 1280(x5=reg256#12 +# asm 2: vmovupd 1280(x5=%ymm11 +vmovupd 1280(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1536 ] +# asm 1: vmovupd 1536(x6=reg256#13 +# asm 2: vmovupd 1536(x6=%ymm12 +vmovupd 1536(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1792 ] +# asm 1: vmovupd 1792(x7=reg256#14 +# asm 2: vmovupd 1792(x7=%ymm13 +vmovupd 1792(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: 
vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 
16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 0 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 32(x0=%ymm6 +vmovupd 32(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 288 ] +# asm 1: vmovupd 288(x1=reg256#8 +# asm 2: vmovupd 288(x1=%ymm7 +vmovupd 288(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 544 ] +# asm 1: vmovupd 544(x2=reg256#9 +# asm 2: vmovupd 544(x2=%ymm8 +vmovupd 544(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 800 
] +# asm 1: vmovupd 800(x3=reg256#10 +# asm 2: vmovupd 800(x3=%ymm9 +vmovupd 800(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1056 ] +# asm 1: vmovupd 1056(x4=reg256#11 +# asm 2: vmovupd 1056(x4=%ymm10 +vmovupd 1056(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1312 ] +# asm 1: vmovupd 1312(x5=reg256#12 +# asm 2: vmovupd 1312(x5=%ymm11 +vmovupd 1312(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1568 ] +# asm 1: vmovupd 1568(x6=reg256#13 +# asm 2: vmovupd 1568(x6=%ymm12 +vmovupd 1568(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1824 ] +# asm 1: vmovupd 1824(x7=reg256#14 +# asm 2: vmovupd 1824(x7=%ymm13 +vmovupd 1824(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld 
$16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw 
$8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 32 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 64(x0=%ymm6 +vmovupd 64(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 320 ] +# asm 1: vmovupd 320(x1=reg256#8 +# asm 2: vmovupd 320(x1=%ymm7 +vmovupd 320(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 576 ] +# asm 1: vmovupd 576(x2=reg256#9 +# asm 2: vmovupd 576(x2=%ymm8 +vmovupd 576(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 832 ] +# asm 1: vmovupd 832(x3=reg256#10 +# asm 2: vmovupd 832(x3=%ymm9 +vmovupd 832(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1088 ] +# asm 1: vmovupd 1088(x4=reg256#11 +# asm 2: vmovupd 1088(x4=%ymm10 +vmovupd 1088(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1344 ] +# asm 1: vmovupd 1344(x5=reg256#12 +# asm 2: vmovupd 1344(x5=%ymm11 +vmovupd 1344(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1600 ] +# asm 1: vmovupd 1600(x6=reg256#13 +# asm 2: vmovupd 1600(x6=%ymm12 +vmovupd 1600(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1856 ] +# asm 1: vmovupd 1856(x7=reg256#14 +# asm 2: vmovupd 1856(x7=%ymm13 +vmovupd 1856(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand 
v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: 
[Elided: machine-generated qhasm/AVX2 assembly added by this patch (PQCLEAN_MCELIECE348864_AVX). The block repeats a single interleave pattern per pair of rows — v00 = x_i & mask_even, v10 = x_j << s, v01 = x_i unsigned>> s, v11 = x_j & mask_odd, x_i = v00 | v10, x_j = v01 | v11 — at bit widths s = 32, 16 and 8 (vpand, vpsllq/vpsrlq, vpslld/vpsrld, vpsllw/vpsrlw and vpor on the mask registers %ymm0-%ymm5), over groups of eight 256-bit rows loaded with vmovupd from input_0 (%rdi) at 256-byte stride (offsets up to 2016) and stored back to input_0. A scalar sketch of one such round follows.]
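As rough orientation for the pattern summarized above, here is a scalar C analog of a single interleave round. The function name interleave_step, the uint64_t row type and the extra masking around the shifts are illustrative assumptions; the vector code can drop those masks because vpsllq/vpslld/vpsllw shift within fixed hardware lanes. This is a sketch of the technique, not code from the patch.

    #include <stdint.h>

    /* Scalar analog of one interleave round from the qhasm comments above.
     * Rows a and b exchange s-bit groups: mask_lo selects the low s bits and
     * mask_hi the high s bits of every 2*s-bit lane.  (Illustrative names.) */
    static inline void interleave_step(uint64_t *a, uint64_t *b,
                                       uint64_t mask_lo, uint64_t mask_hi, unsigned s)
    {
        uint64_t v00 = *a & mask_lo;         /* vpand  x_i, mask_even           */
        uint64_t v10 = (*b & mask_lo) << s;  /* vpsllq/vpslld/vpsllw x_j by s   */
        uint64_t v01 = (*a & mask_hi) >> s;  /* vpsrlq/vpsrld/vpsrlw x_i by s   */
        uint64_t v11 = *b & mask_hi;         /* vpand  x_j, mask_odd            */
        *a = v00 | v10;                      /* vpor: low groups of a and of b  */
        *b = v01 | v11;                      /* vpor: high groups of a and of b */
    }

Applying this round with s = 32, 16, 8, 4, 2 and 1, each with its matching mask pair, to suitably paired rows gives a full bit-matrix transpose; that is what the generated file does with 256-bit ymm registers, four 64-bit words per instruction.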
[Elided: continuation of the same machine-generated file. The code reloads mask0..mask5 with vmovapd from PQCLEAN_MCELIECE348864_AVX_MASK2_0/_1, MASK1_0/_1 and MASK0_0/_1, then runs the remaining interleave rounds at bit widths 4, 2 and 1 (vpsllq/vpsrlq by 4, 2 and 1, combined with vpand and vpor) over successive groups of eight 256-bit rows at 32-byte stride from input_0, storing each transposed group back to input_0 before loading the next. A portable-C sketch of the complete transpose follows.]
v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq 
$2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 
1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 512 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 768(x0=%ymm6 +vmovupd 768(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 800 ] +# asm 1: vmovupd 800(x1=reg256#8 +# asm 2: vmovupd 800(x1=%ymm7 +vmovupd 800(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 832 ] +# asm 1: vmovupd 832(x2=reg256#9 +# asm 2: vmovupd 832(x2=%ymm8 +vmovupd 832(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 864 ] +# asm 1: vmovupd 864(x3=reg256#10 +# asm 2: vmovupd 864(x3=%ymm9 +vmovupd 864(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 896 ] +# asm 1: vmovupd 896(x4=reg256#11 +# asm 2: vmovupd 896(x4=%ymm10 +vmovupd 896(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 928 ] +# asm 1: vmovupd 928(x5=reg256#12 +# asm 2: vmovupd 928(x5=%ymm11 +vmovupd 928(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 960 ] +# asm 1: vmovupd 960(x6=reg256#13 +# asm 2: vmovupd 960(x6=%ymm12 +vmovupd 960(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 992 ] +# asm 1: vmovupd 992(x7=reg256#14 +# asm 2: vmovupd 992(x7=%ymm13 +vmovupd 992(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 
+vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# 
asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 768 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 1024(x0=%ymm6 +vmovupd 1024(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 1056 ] +# asm 1: vmovupd 1056(x1=reg256#8 +# asm 2: vmovupd 1056(x1=%ymm7 +vmovupd 1056(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 1088 ] +# asm 1: vmovupd 1088(x2=reg256#9 +# asm 2: vmovupd 1088(x2=%ymm8 +vmovupd 1088(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 1120 ] +# asm 1: vmovupd 1120(x3=reg256#10 +# asm 2: vmovupd 1120(x3=%ymm9 +vmovupd 1120(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1152 ] +# asm 1: vmovupd 1152(x4=reg256#11 +# asm 2: vmovupd 1152(x4=%ymm10 +vmovupd 1152(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1184 ] +# asm 1: 
vmovupd 1184(x5=reg256#12 +# asm 2: vmovupd 1184(x5=%ymm11 +vmovupd 1184(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1216 ] +# asm 1: vmovupd 1216(x6=reg256#13 +# asm 2: vmovupd 1216(x6=%ymm12 +vmovupd 1216(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1248 ] +# asm 1: vmovupd 1248(x7=reg256#14 +# asm 2: vmovupd 1248(x7=%ymm13 +vmovupd 1248(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 
+# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand 
v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 1024 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 1280(x0=%ymm6 +vmovupd 1280(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 1312 ] +# asm 1: vmovupd 1312(x1=reg256#8 +# asm 2: vmovupd 1312(x1=%ymm7 +vmovupd 1312(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 1344 ] +# asm 1: vmovupd 1344(x2=reg256#9 +# asm 2: vmovupd 1344(x2=%ymm8 +vmovupd 1344(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 1376 ] +# asm 1: vmovupd 1376(x3=reg256#10 +# asm 2: vmovupd 1376(x3=%ymm9 +vmovupd 1376(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1408 ] +# asm 1: vmovupd 1408(x4=reg256#11 +# asm 2: vmovupd 1408(x4=%ymm10 +vmovupd 1408(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1440 ] +# asm 1: vmovupd 1440(x5=reg256#12 +# asm 2: vmovupd 1440(x5=%ymm11 +vmovupd 1440(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1472 ] +# asm 1: vmovupd 1472(x6=reg256#13 +# asm 2: vmovupd 1472(x6=%ymm12 +vmovupd 1472(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1504 ] +# asm 1: vmovupd 1504(x7=reg256#14 +# asm 2: vmovupd 1504(x7=%ymm13 +vmovupd 1504(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 
+# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand 
v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 1280 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 1536(x0=%ymm6 +vmovupd 1536(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 1568 ] +# asm 1: vmovupd 1568(x1=reg256#8 +# asm 2: vmovupd 1568(x1=%ymm7 +vmovupd 1568(%rdi),%ymm7 
+ +# qhasm: x2 = mem256[ input_0 + 1600 ] +# asm 1: vmovupd 1600(x2=reg256#9 +# asm 2: vmovupd 1600(x2=%ymm8 +vmovupd 1600(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 1632 ] +# asm 1: vmovupd 1632(x3=reg256#10 +# asm 2: vmovupd 1632(x3=%ymm9 +vmovupd 1632(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1664 ] +# asm 1: vmovupd 1664(x4=reg256#11 +# asm 2: vmovupd 1664(x4=%ymm10 +vmovupd 1664(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1696 ] +# asm 1: vmovupd 1696(x5=reg256#12 +# asm 2: vmovupd 1696(x5=%ymm11 +vmovupd 1696(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1728 ] +# asm 1: vmovupd 1728(x6=reg256#13 +# asm 2: vmovupd 1728(x6=%ymm12 +vmovupd 1728(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1760 ] +# asm 1: vmovupd 1760(x7=reg256#14 +# asm 2: vmovupd 1760(x7=%ymm13 +vmovupd 1760(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand 
v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# 
asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 1536 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 1792(x0=%ymm6 +vmovupd 1792(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 1824 ] +# asm 1: vmovupd 1824(x1=reg256#8 +# asm 2: vmovupd 1824(x1=%ymm7 +vmovupd 1824(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 1856 ] +# asm 1: vmovupd 1856(x2=reg256#9 +# asm 2: vmovupd 1856(x2=%ymm8 +vmovupd 1856(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 1888 ] +# asm 1: vmovupd 1888(x3=reg256#10 +# asm 2: vmovupd 1888(x3=%ymm9 +vmovupd 1888(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1920 ] +# asm 1: vmovupd 1920(x4=reg256#11 +# asm 2: vmovupd 1920(x4=%ymm10 +vmovupd 1920(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1952 ] +# asm 1: vmovupd 1952(x5=reg256#12 +# asm 2: vmovupd 1952(x5=%ymm11 +vmovupd 1952(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1984 ] +# asm 1: vmovupd 1984(x6=reg256#13 +# asm 2: vmovupd 1984(x6=%ymm12 +vmovupd 1984(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 2016 ] +# asm 1: vmovupd 2016(x7=reg256#14 +# asm 2: vmovupd 2016(x7=%ymm13 +vmovupd 2016(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand 
[... qhasm-generated AVX/SSE bit-transpose assembly for PQCLEAN_MCELIECE348864_AVX (continued): interleaved "# qhasm:", "# asm 1:" and "# asm 2:" annotations followed by the emitted vpand/vpor/vpsllq/vpsrlq/vpslld/vpsrld/vpsllw/vpsrlw, movdqa, movddup, pextrq and movq instructions over %ymm0-%ymm15 and %xmm0-%xmm15, using the PQCLEAN_MCELIECE348864_AVX_MASK3_0/1, MASK4_0/1 and MASK5_0/1 constants and 64-bit loads and stores relative to input_0 (%rdi); the machine-generated listing continues ...]
432 ] x2 +# asm 1: movddup 432(r6=reg128#13 +# asm 2: movddup 432(r6=%xmm12 +movddup 432(%rdi),%xmm12 + +# qhasm: r7 = mem64[ input_0 + 496 ] x2 +# asm 1: movddup 496(r7=reg128#14 +# asm 2: movddup 496(r7=%xmm13 +movddup 496(%rdi),%xmm13 + +# qhasm: v00 = r0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = r4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = r0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = r4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#15 +# asm 2: vpor r0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r4 = v01 | v11 +# asm 1: vpor r4=reg128#7 +# asm 2: vpor r4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = r1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = r5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = r1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = r5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: r1 = v00 | v10 +# asm 1: vpor r1=reg128#11 +# asm 2: vpor r1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#8 +# asm 2: vpor r5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = r2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = r6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = r2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = r6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: r2 = v00 | v10 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#9 +# asm 2: vpor r6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = r3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = r7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = r3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = r7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: r3 = v00 | v10 +# asm 1: vpor r3=reg128#13 +# asm 2: vpor r3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#10 +# asm 2: vpor r7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = r0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = r2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = r0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = r2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#14 +# asm 2: vpor r0=%xmm13 +vpor 
%xmm15,%xmm13,%xmm13 + +# qhasm: r2 = v01 | v11 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = r1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = r3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = r1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = r3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: r1 = v00 | v10 +# asm 1: vpor r1=reg128#15 +# asm 2: vpor r1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = r4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = r6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = r4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = r6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: r4 = v00 | v10 +# asm 1: vpor r4=reg128#13 +# asm 2: vpor r4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#7 +# asm 2: vpor r6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = r5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = r7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = r5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = r7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: r5 = v00 | v10 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#8 +# asm 2: vpor r7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = r0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = r1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = r0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = r1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#10 +# asm 2: vpor r0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: r1 = v01 | v11 +# asm 1: vpor r1=reg128#14 +# asm 2: vpor r1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = r2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = r3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = r2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = r3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: r2 = v00 | v10 +# asm 1: vpor r2=reg128#15 +# asm 2: vpor r2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: 
v00 = r4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = r5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = r4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = r5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: r4 = v00 | v10 +# asm 1: vpor r4=reg128#12 +# asm 2: vpor r4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = r6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = r7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = r6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = r7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: r6 = v00 | v10 +# asm 1: vpor r6=reg128#13 +# asm 2: vpor r6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#7 +# asm 2: vpor r7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: buf = r0[0] +# asm 1: pextrq $0x0,buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm9,%rsi + +# qhasm: mem64[ input_0 + 48 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm13,%rsi + +# qhasm: mem64[ input_0 + 112 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm14,%rsi + +# qhasm: mem64[ input_0 + 176 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm10,%rsi + +# qhasm: mem64[ input_0 + 240 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm11,%rsi + +# qhasm: mem64[ input_0 + 304 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm8,%rsi + +# qhasm: mem64[ input_0 + 368 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm12,%rsi + +# qhasm: mem64[ input_0 + 432 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm6,%rsi + +# qhasm: mem64[ input_0 + 496 ] = buf +# asm 1: movq r0=reg128#7 +# asm 2: movddup 56(r0=%xmm6 +movddup 56(%rdi),%xmm6 + +# qhasm: r1 = mem64[ input_0 + 120 ] x2 +# asm 1: movddup 120(r1=reg128#8 +# asm 2: movddup 120(r1=%xmm7 +movddup 120(%rdi),%xmm7 + +# qhasm: r2 = mem64[ input_0 + 184 ] x2 +# asm 1: movddup 184(r2=reg128#9 +# asm 2: movddup 184(r2=%xmm8 +movddup 184(%rdi),%xmm8 + +# qhasm: r3 = mem64[ input_0 + 248 ] x2 +# asm 1: movddup 248(r3=reg128#10 +# asm 2: movddup 248(r3=%xmm9 +movddup 248(%rdi),%xmm9 + +# qhasm: r4 = mem64[ input_0 + 312 ] x2 +# asm 1: movddup 312(r4=reg128#11 +# asm 2: movddup 312(r4=%xmm10 +movddup 312(%rdi),%xmm10 + +# qhasm: r5 = mem64[ input_0 + 376 ] x2 +# asm 1: movddup 376(r5=reg128#12 +# asm 2: movddup 376(r5=%xmm11 +movddup 376(%rdi),%xmm11 + +# qhasm: r6 = mem64[ input_0 + 440 ] x2 +# asm 1: movddup 440(r6=reg128#13 +# asm 2: movddup 440(r6=%xmm12 +movddup 440(%rdi),%xmm12 + +# qhasm: r7 = mem64[ input_0 + 504 ] x2 +# asm 1: movddup 504(r7=reg128#14 +# asm 2: movddup 504(r7=%xmm13 +movddup 504(%rdi),%xmm13 + +# qhasm: v00 = r0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = r4 << 32 +# asm 1: vpsllq 
$32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = r0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = r4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#15 +# asm 2: vpor r0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r4 = v01 | v11 +# asm 1: vpor r4=reg128#7 +# asm 2: vpor r4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = r1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = r5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = r1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = r5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: r1 = v00 | v10 +# asm 1: vpor r1=reg128#11 +# asm 2: vpor r1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#8 +# asm 2: vpor r5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = r2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = r6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = r2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = r6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: r2 = v00 | v10 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#9 +# asm 2: vpor r6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = r3 & mask0 +# asm 1: vpand v00=reg128#1 +# asm 2: vpand v00=%xmm0 +vpand %xmm0,%xmm9,%xmm0 + +# qhasm: 2x v10 = r7 << 32 +# asm 1: vpsllq $32,v10=reg128#13 +# asm 2: vpsllq $32,v10=%xmm12 +vpsllq $32,%xmm13,%xmm12 + +# qhasm: 2x v01 = r3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = r7 & mask1 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm1,%xmm13,%xmm1 + +# qhasm: r3 = v00 | v10 +# asm 1: vpor r3=reg128#1 +# asm 2: vpor r3=%xmm0 +vpor %xmm12,%xmm0,%xmm0 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#2 +# asm 2: vpor r7=%xmm1 +vpor %xmm1,%xmm9,%xmm1 + +# qhasm: v00 = r0 & mask2 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm2,%xmm14,%xmm9 + +# qhasm: 4x v10 = r2 << 16 +# asm 1: vpslld $16,v10=reg128#13 +# asm 2: vpslld $16,v10=%xmm12 +vpslld $16,%xmm11,%xmm12 + +# qhasm: 4x v01 = r0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#14 +# asm 2: vpsrld $16,v01=%xmm13 +vpsrld $16,%xmm14,%xmm13 + +# qhasm: v11 = r2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#10 +# asm 2: vpor r0=%xmm9 +vpor %xmm12,%xmm9,%xmm9 + +# qhasm: r2 = v01 | v11 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm11,%xmm13,%xmm11 + +# qhasm: v00 = r1 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm10,%xmm12 + +# qhasm: 4x v10 = r3 << 16 +# asm 1: vpslld $16,v10=reg128#14 +# asm 2: vpslld $16,v10=%xmm13 +vpslld $16,%xmm0,%xmm13 + +# qhasm: 4x v01 = r1 unsigned>> 16 +# asm 1: vpsrld 
$16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = r3 & mask3 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm3,%xmm0,%xmm0 + +# qhasm: r1 = v00 | v10 +# asm 1: vpor r1=reg128#13 +# asm 2: vpor r1=%xmm12 +vpor %xmm13,%xmm12,%xmm12 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#1 +# asm 2: vpor r3=%xmm0 +vpor %xmm0,%xmm10,%xmm0 + +# qhasm: v00 = r4 & mask2 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm2,%xmm6,%xmm10 + +# qhasm: 4x v10 = r6 << 16 +# asm 1: vpslld $16,v10=reg128#14 +# asm 2: vpslld $16,v10=%xmm13 +vpslld $16,%xmm8,%xmm13 + +# qhasm: 4x v01 = r4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = r6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: r4 = v00 | v10 +# asm 1: vpor r4=reg128#11 +# asm 2: vpor r4=%xmm10 +vpor %xmm13,%xmm10,%xmm10 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#7 +# asm 2: vpor r6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = r5 & mask2 +# asm 1: vpand v00=reg128#3 +# asm 2: vpand v00=%xmm2 +vpand %xmm2,%xmm7,%xmm2 + +# qhasm: 4x v10 = r7 << 16 +# asm 1: vpslld $16,v10=reg128#9 +# asm 2: vpslld $16,v10=%xmm8 +vpslld $16,%xmm1,%xmm8 + +# qhasm: 4x v01 = r5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = r7 & mask3 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm3,%xmm1,%xmm1 + +# qhasm: r5 = v00 | v10 +# asm 1: vpor r5=reg128#3 +# asm 2: vpor r5=%xmm2 +vpor %xmm8,%xmm2,%xmm2 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#2 +# asm 2: vpor r7=%xmm1 +vpor %xmm1,%xmm7,%xmm1 + +# qhasm: v00 = r0 & mask4 +# asm 1: vpand v00=reg128#4 +# asm 2: vpand v00=%xmm3 +vpand %xmm4,%xmm9,%xmm3 + +# qhasm: 8x v10 = r1 << 8 +# asm 1: vpsllw $8,v10=reg128#8 +# asm 2: vpsllw $8,v10=%xmm7 +vpsllw $8,%xmm12,%xmm7 + +# qhasm: 8x v01 = r0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#9 +# asm 2: vpsrlw $8,v01=%xmm8 +vpsrlw $8,%xmm9,%xmm8 + +# qhasm: v11 = r1 & mask5 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm5,%xmm12,%xmm9 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#4 +# asm 2: vpor r0=%xmm3 +vpor %xmm7,%xmm3,%xmm3 + +# qhasm: r1 = v01 | v11 +# asm 1: vpor r1=reg128#8 +# asm 2: vpor r1=%xmm7 +vpor %xmm9,%xmm8,%xmm7 + +# qhasm: v00 = r2 & mask4 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm4,%xmm11,%xmm8 + +# qhasm: 8x v10 = r3 << 8 +# asm 1: vpsllw $8,v10=reg128#10 +# asm 2: vpsllw $8,v10=%xmm9 +vpsllw $8,%xmm0,%xmm9 + +# qhasm: 8x v01 = r2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = r3 & mask5 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm5,%xmm0,%xmm0 + +# qhasm: r2 = v00 | v10 +# asm 1: vpor r2=reg128#9 +# asm 2: vpor r2=%xmm8 +vpor %xmm9,%xmm8,%xmm8 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#1 +# asm 2: vpor r3=%xmm0 +vpor %xmm0,%xmm11,%xmm0 + +# qhasm: v00 = r4 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm10,%xmm9 + +# qhasm: 8x v10 = r5 << 8 +# asm 1: vpsllw $8,v10=reg128#12 +# asm 2: vpsllw $8,v10=%xmm11 +vpsllw $8,%xmm2,%xmm11 + +# qhasm: 8x v01 = r4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#11 +# asm 2: vpsrlw $8,v01=%xmm10 +vpsrlw $8,%xmm10,%xmm10 + +# qhasm: v11 = r5 & mask5 +# asm 1: vpand v11=reg128#3 +# asm 2: vpand v11=%xmm2 +vpand %xmm5,%xmm2,%xmm2 + +# qhasm: r4 = 
v00 | v10 +# asm 1: vpor r4=reg128#10 +# asm 2: vpor r4=%xmm9 +vpor %xmm11,%xmm9,%xmm9 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#3 +# asm 2: vpor r5=%xmm2 +vpor %xmm2,%xmm10,%xmm2 + +# qhasm: v00 = r6 & mask4 +# asm 1: vpand v00=reg128#5 +# asm 2: vpand v00=%xmm4 +vpand %xmm4,%xmm6,%xmm4 + +# qhasm: 8x v10 = r7 << 8 +# asm 1: vpsllw $8,v10=reg128#11 +# asm 2: vpsllw $8,v10=%xmm10 +vpsllw $8,%xmm1,%xmm10 + +# qhasm: 8x v01 = r6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = r7 & mask5 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm5,%xmm1,%xmm1 + +# qhasm: r6 = v00 | v10 +# asm 1: vpor r6=reg128#5 +# asm 2: vpor r6=%xmm4 +vpor %xmm10,%xmm4,%xmm4 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#2 +# asm 2: vpor r7=%xmm1 +vpor %xmm1,%xmm6,%xmm1 + +# qhasm: buf = r0[0] +# asm 1: pextrq $0x0,buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm3,%rsi + +# qhasm: mem64[ input_0 + 56 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm7,%rsi + +# qhasm: mem64[ input_0 + 120 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm8,%rsi + +# qhasm: mem64[ input_0 + 184 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm0,%rsi + +# qhasm: mem64[ input_0 + 248 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm9,%rsi + +# qhasm: mem64[ input_0 + 312 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm2,%rsi + +# qhasm: mem64[ input_0 + 376 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm4,%rsi + +# qhasm: mem64[ input_0 + 440 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm1,%rsi + +# qhasm: mem64[ input_0 + 504 ] = buf +# asm 1: movq mask0=reg128#1 +# asm 2: movdqa PQCLEAN_MCELIECE348864_AVX_MASK2_0(%rip),>mask0=%xmm0 +movdqa PQCLEAN_MCELIECE348864_AVX_MASK2_0(%rip),%xmm0 + +# qhasm: mask1 aligned= mem128[ PQCLEAN_MCELIECE348864_AVX_MASK2_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE348864_AVX_MASK2_1(%rip),>mask1=reg128#2 +# asm 2: movdqa PQCLEAN_MCELIECE348864_AVX_MASK2_1(%rip),>mask1=%xmm1 +movdqa PQCLEAN_MCELIECE348864_AVX_MASK2_1(%rip),%xmm1 + +# qhasm: mask2 aligned= mem128[ PQCLEAN_MCELIECE348864_AVX_MASK1_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE348864_AVX_MASK1_0(%rip),>mask2=reg128#3 +# asm 2: movdqa PQCLEAN_MCELIECE348864_AVX_MASK1_0(%rip),>mask2=%xmm2 +movdqa PQCLEAN_MCELIECE348864_AVX_MASK1_0(%rip),%xmm2 + +# qhasm: mask3 aligned= mem128[ PQCLEAN_MCELIECE348864_AVX_MASK1_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE348864_AVX_MASK1_1(%rip),>mask3=reg128#4 +# asm 2: movdqa PQCLEAN_MCELIECE348864_AVX_MASK1_1(%rip),>mask3=%xmm3 +movdqa PQCLEAN_MCELIECE348864_AVX_MASK1_1(%rip),%xmm3 + +# qhasm: mask4 aligned= mem128[ PQCLEAN_MCELIECE348864_AVX_MASK0_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE348864_AVX_MASK0_0(%rip),>mask4=reg128#5 +# asm 2: movdqa PQCLEAN_MCELIECE348864_AVX_MASK0_0(%rip),>mask4=%xmm4 +movdqa PQCLEAN_MCELIECE348864_AVX_MASK0_0(%rip),%xmm4 + +# qhasm: mask5 aligned= mem128[ PQCLEAN_MCELIECE348864_AVX_MASK0_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE348864_AVX_MASK0_1(%rip),>mask5=reg128#6 +# asm 2: movdqa PQCLEAN_MCELIECE348864_AVX_MASK0_1(%rip),>mask5=%xmm5 +movdqa PQCLEAN_MCELIECE348864_AVX_MASK0_1(%rip),%xmm5 + +# qhasm: r0 = mem64[ input_0 + 0 ] x2 +# asm 1: movddup 0(r0=reg128#7 +# asm 2: movddup 0(r0=%xmm6 +movddup 0(%rdi),%xmm6 + +# qhasm: r1 = mem64[ input_0 + 8 ] x2 +# 
asm 1: movddup 8(r1=reg128#8 +# asm 2: movddup 8(r1=%xmm7 +movddup 8(%rdi),%xmm7 + +# qhasm: r2 = mem64[ input_0 + 16 ] x2 +# asm 1: movddup 16(r2=reg128#9 +# asm 2: movddup 16(r2=%xmm8 +movddup 16(%rdi),%xmm8 + +# qhasm: r3 = mem64[ input_0 + 24 ] x2 +# asm 1: movddup 24(r3=reg128#10 +# asm 2: movddup 24(r3=%xmm9 +movddup 24(%rdi),%xmm9 + +# qhasm: r4 = mem64[ input_0 + 32 ] x2 +# asm 1: movddup 32(r4=reg128#11 +# asm 2: movddup 32(r4=%xmm10 +movddup 32(%rdi),%xmm10 + +# qhasm: r5 = mem64[ input_0 + 40 ] x2 +# asm 1: movddup 40(r5=reg128#12 +# asm 2: movddup 40(r5=%xmm11 +movddup 40(%rdi),%xmm11 + +# qhasm: r6 = mem64[ input_0 + 48 ] x2 +# asm 1: movddup 48(r6=reg128#13 +# asm 2: movddup 48(r6=%xmm12 +movddup 48(%rdi),%xmm12 + +# qhasm: r7 = mem64[ input_0 + 56 ] x2 +# asm 1: movddup 56(r7=reg128#14 +# asm 2: movddup 56(r7=%xmm13 +movddup 56(%rdi),%xmm13 + +# qhasm: v00 = r0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = r4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = r4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r0=reg128#15 +# asm 2: vpor r0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r4 = v01 | v11 +# asm 1: vpor r4=reg128#7 +# asm 2: vpor r4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = r1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = r5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = r5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r1=reg128#11 +# asm 2: vpor r1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#8 +# asm 2: vpor r5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = r2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = r6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = r6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#9 +# asm 2: vpor r6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = r3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = r7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = r7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r3=reg128#13 +# asm 2: vpor r3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#10 +# asm 2: vpor r7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = r0 
& mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = r2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = r2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r0=reg128#14 +# asm 2: vpor r0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: r2 = v01 | v11 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = r1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = r3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = r3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r1=reg128#15 +# asm 2: vpor r1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = r4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = r6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = r6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r4=reg128#13 +# asm 2: vpor r4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#7 +# asm 2: vpor r6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = r5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = r7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = r7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#8 +# asm 2: vpor r7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = r0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = r1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = r1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r0=reg128#10 +# asm 2: vpor r0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: r1 = v01 | v11 +# asm 1: vpor r1=reg128#14 +# asm 2: vpor r1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = r2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = r3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x 
v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = r3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r2=reg128#15 +# asm 2: vpor r2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = r4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = r5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = r5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r4=reg128#12 +# asm 2: vpor r4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = r6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = r7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = r7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r6=reg128#13 +# asm 2: vpor r6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#7 +# asm 2: vpor r7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: t0 = r0[0]r1[0] +# asm 1: vpunpcklqdq t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm13,%xmm9,%xmm7 + +# qhasm: mem128[ input_0 + 0 ] = t0 +# asm 1: movdqu t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm10,%xmm14,%xmm7 + +# qhasm: mem128[ input_0 + 16 ] = t0 +# asm 1: movdqu t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm8,%xmm11,%xmm7 + +# qhasm: mem128[ input_0 + 32 ] = t0 +# asm 1: movdqu t0=reg128#7 +# asm 2: vpunpcklqdq t0=%xmm6 +vpunpcklqdq %xmm6,%xmm12,%xmm6 + +# qhasm: mem128[ input_0 + 48 ] = t0 +# asm 1: movdqu r0=reg128#7 +# asm 2: movddup 64(r0=%xmm6 +movddup 64(%rdi),%xmm6 + +# qhasm: r1 = mem64[ input_0 + 72 ] x2 +# asm 1: movddup 72(r1=reg128#8 +# asm 2: movddup 72(r1=%xmm7 +movddup 72(%rdi),%xmm7 + +# qhasm: r2 = mem64[ input_0 + 80 ] x2 +# asm 1: movddup 80(r2=reg128#9 +# asm 2: movddup 80(r2=%xmm8 +movddup 80(%rdi),%xmm8 + +# qhasm: r3 = mem64[ input_0 + 88 ] x2 +# asm 1: movddup 88(r3=reg128#10 +# asm 2: movddup 88(r3=%xmm9 +movddup 88(%rdi),%xmm9 + +# qhasm: r4 = mem64[ input_0 + 96 ] x2 +# asm 1: movddup 96(r4=reg128#11 +# asm 2: movddup 96(r4=%xmm10 +movddup 96(%rdi),%xmm10 + +# qhasm: r5 = mem64[ input_0 + 104 ] x2 +# asm 1: movddup 104(r5=reg128#12 +# asm 2: movddup 104(r5=%xmm11 +movddup 104(%rdi),%xmm11 + +# qhasm: r6 = mem64[ input_0 + 112 ] x2 +# asm 1: movddup 112(r6=reg128#13 +# asm 2: movddup 112(r6=%xmm12 +movddup 112(%rdi),%xmm12 + +# qhasm: r7 = mem64[ input_0 + 120 ] x2 +# asm 1: movddup 120(r7=reg128#14 +# asm 2: movddup 120(r7=%xmm13 +movddup 120(%rdi),%xmm13 + +# qhasm: v00 = r0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = r4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand 
%xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = r4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r0=reg128#15 +# asm 2: vpor r0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r4 = v01 | v11 +# asm 1: vpor r4=reg128#7 +# asm 2: vpor r4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = r1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = r5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = r5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r1=reg128#11 +# asm 2: vpor r1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#8 +# asm 2: vpor r5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = r2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = r6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = r6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#9 +# asm 2: vpor r6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = r3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = r7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = r7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r3=reg128#13 +# asm 2: vpor r3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#10 +# asm 2: vpor r7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = r0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = r2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = r2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r0=reg128#14 +# asm 2: vpor r0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: r2 = v01 | v11 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = r1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = r3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = r3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand 
v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r1=reg128#15 +# asm 2: vpor r1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = r4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = r6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = r6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r4=reg128#13 +# asm 2: vpor r4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#7 +# asm 2: vpor r6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = r5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = r7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = r7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#8 +# asm 2: vpor r7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = r0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = r1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = r1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r0=reg128#10 +# asm 2: vpor r0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: r1 = v01 | v11 +# asm 1: vpor r1=reg128#14 +# asm 2: vpor r1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = r2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = r3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = r3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r2=reg128#15 +# asm 2: vpor r2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = r4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = r5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = r5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r4=reg128#12 +# asm 2: vpor r4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor 
r5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = r6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = r7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = r7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r6=reg128#13 +# asm 2: vpor r6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#7 +# asm 2: vpor r7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: t0 = r0[0]r1[0] +# asm 1: vpunpcklqdq t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm13,%xmm9,%xmm7 + +# qhasm: mem128[ input_0 + 64 ] = t0 +# asm 1: movdqu t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm10,%xmm14,%xmm7 + +# qhasm: mem128[ input_0 + 80 ] = t0 +# asm 1: movdqu t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm8,%xmm11,%xmm7 + +# qhasm: mem128[ input_0 + 96 ] = t0 +# asm 1: movdqu t0=reg128#7 +# asm 2: vpunpcklqdq t0=%xmm6 +vpunpcklqdq %xmm6,%xmm12,%xmm6 + +# qhasm: mem128[ input_0 + 112 ] = t0 +# asm 1: movdqu r0=reg128#7 +# asm 2: movddup 128(r0=%xmm6 +movddup 128(%rdi),%xmm6 + +# qhasm: r1 = mem64[ input_0 + 136 ] x2 +# asm 1: movddup 136(r1=reg128#8 +# asm 2: movddup 136(r1=%xmm7 +movddup 136(%rdi),%xmm7 + +# qhasm: r2 = mem64[ input_0 + 144 ] x2 +# asm 1: movddup 144(r2=reg128#9 +# asm 2: movddup 144(r2=%xmm8 +movddup 144(%rdi),%xmm8 + +# qhasm: r3 = mem64[ input_0 + 152 ] x2 +# asm 1: movddup 152(r3=reg128#10 +# asm 2: movddup 152(r3=%xmm9 +movddup 152(%rdi),%xmm9 + +# qhasm: r4 = mem64[ input_0 + 160 ] x2 +# asm 1: movddup 160(r4=reg128#11 +# asm 2: movddup 160(r4=%xmm10 +movddup 160(%rdi),%xmm10 + +# qhasm: r5 = mem64[ input_0 + 168 ] x2 +# asm 1: movddup 168(r5=reg128#12 +# asm 2: movddup 168(r5=%xmm11 +movddup 168(%rdi),%xmm11 + +# qhasm: r6 = mem64[ input_0 + 176 ] x2 +# asm 1: movddup 176(r6=reg128#13 +# asm 2: movddup 176(r6=%xmm12 +movddup 176(%rdi),%xmm12 + +# qhasm: r7 = mem64[ input_0 + 184 ] x2 +# asm 1: movddup 184(r7=reg128#14 +# asm 2: movddup 184(r7=%xmm13 +movddup 184(%rdi),%xmm13 + +# qhasm: v00 = r0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = r4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = r4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r0=reg128#15 +# asm 2: vpor r0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r4 = v01 | v11 +# asm 1: vpor r4=reg128#7 +# asm 2: vpor r4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = r1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = r5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = r5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r1=reg128#11 +# asm 2: vpor r1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: r5 = v01 | v11 +# asm 1: 
vpor r5=reg128#8 +# asm 2: vpor r5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = r2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = r6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = r6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#9 +# asm 2: vpor r6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = r3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = r7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = r7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r3=reg128#13 +# asm 2: vpor r3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#10 +# asm 2: vpor r7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = r0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = r2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = r2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r0=reg128#14 +# asm 2: vpor r0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: r2 = v01 | v11 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = r1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = r3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = r3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r1=reg128#15 +# asm 2: vpor r1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = r4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = r6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = r6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r4=reg128#13 +# asm 2: vpor r4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#7 +# asm 2: vpor r6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = r5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = r7 & mask2 +# asm 
1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = r7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#8 +# asm 2: vpor r7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = r0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = r1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = r1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r0=reg128#10 +# asm 2: vpor r0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: r1 = v01 | v11 +# asm 1: vpor r1=reg128#14 +# asm 2: vpor r1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = r2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = r3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = r3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r2=reg128#15 +# asm 2: vpor r2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = r4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = r5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = r5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r4=reg128#12 +# asm 2: vpor r4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = r6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = r7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = r7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r6=reg128#13 +# asm 2: vpor r6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#7 +# asm 2: vpor r7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: t0 = r0[0]r1[0] +# asm 1: vpunpcklqdq t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm13,%xmm9,%xmm7 + +# qhasm: mem128[ input_0 + 128 ] = t0 +# asm 1: movdqu t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm10,%xmm14,%xmm7 + +# qhasm: mem128[ input_0 + 144 ] = t0 +# asm 1: movdqu t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq 
%xmm8,%xmm11,%xmm7 + +# qhasm: mem128[ input_0 + 160 ] = t0 +# asm 1: movdqu t0=reg128#7 +# asm 2: vpunpcklqdq t0=%xmm6 +vpunpcklqdq %xmm6,%xmm12,%xmm6 + +# qhasm: mem128[ input_0 + 176 ] = t0 +# asm 1: movdqu r0=reg128#7 +# asm 2: movddup 192(r0=%xmm6 +movddup 192(%rdi),%xmm6 + +# qhasm: r1 = mem64[ input_0 + 200 ] x2 +# asm 1: movddup 200(r1=reg128#8 +# asm 2: movddup 200(r1=%xmm7 +movddup 200(%rdi),%xmm7 + +# qhasm: r2 = mem64[ input_0 + 208 ] x2 +# asm 1: movddup 208(r2=reg128#9 +# asm 2: movddup 208(r2=%xmm8 +movddup 208(%rdi),%xmm8 + +# qhasm: r3 = mem64[ input_0 + 216 ] x2 +# asm 1: movddup 216(r3=reg128#10 +# asm 2: movddup 216(r3=%xmm9 +movddup 216(%rdi),%xmm9 + +# qhasm: r4 = mem64[ input_0 + 224 ] x2 +# asm 1: movddup 224(r4=reg128#11 +# asm 2: movddup 224(r4=%xmm10 +movddup 224(%rdi),%xmm10 + +# qhasm: r5 = mem64[ input_0 + 232 ] x2 +# asm 1: movddup 232(r5=reg128#12 +# asm 2: movddup 232(r5=%xmm11 +movddup 232(%rdi),%xmm11 + +# qhasm: r6 = mem64[ input_0 + 240 ] x2 +# asm 1: movddup 240(r6=reg128#13 +# asm 2: movddup 240(r6=%xmm12 +movddup 240(%rdi),%xmm12 + +# qhasm: r7 = mem64[ input_0 + 248 ] x2 +# asm 1: movddup 248(r7=reg128#14 +# asm 2: movddup 248(r7=%xmm13 +movddup 248(%rdi),%xmm13 + +# qhasm: v00 = r0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = r4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = r4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r0=reg128#15 +# asm 2: vpor r0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r4 = v01 | v11 +# asm 1: vpor r4=reg128#7 +# asm 2: vpor r4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = r1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = r5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = r5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r1=reg128#11 +# asm 2: vpor r1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#8 +# asm 2: vpor r5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = r2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = r6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = r6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#9 +# asm 2: vpor r6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = r3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = r7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 
+ +# qhasm: v11 = r7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r3=reg128#13 +# asm 2: vpor r3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#10 +# asm 2: vpor r7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = r0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = r2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = r2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r0=reg128#14 +# asm 2: vpor r0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: r2 = v01 | v11 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = r1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = r3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = r3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r1=reg128#15 +# asm 2: vpor r1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = r4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = r6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = r6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r4=reg128#13 +# asm 2: vpor r4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#7 +# asm 2: vpor r6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = r5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = r7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = r7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#8 +# asm 2: vpor r7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = r0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = r1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = r1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r0=reg128#10 +# asm 2: vpor r0=%xmm9 +vpor 
%xmm15,%xmm9,%xmm9 + +# qhasm: r1 = v01 | v11 +# asm 1: vpor r1=reg128#14 +# asm 2: vpor r1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = r2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = r3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = r3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r2=reg128#15 +# asm 2: vpor r2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = r4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = r5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = r5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r4=reg128#12 +# asm 2: vpor r4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = r6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = r7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = r7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r6=reg128#13 +# asm 2: vpor r6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#7 +# asm 2: vpor r7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: t0 = r0[0]r1[0] +# asm 1: vpunpcklqdq t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm13,%xmm9,%xmm7 + +# qhasm: mem128[ input_0 + 192 ] = t0 +# asm 1: movdqu t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm10,%xmm14,%xmm7 + +# qhasm: mem128[ input_0 + 208 ] = t0 +# asm 1: movdqu t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm8,%xmm11,%xmm7 + +# qhasm: mem128[ input_0 + 224 ] = t0 +# asm 1: movdqu t0=reg128#7 +# asm 2: vpunpcklqdq t0=%xmm6 +vpunpcklqdq %xmm6,%xmm12,%xmm6 + +# qhasm: mem128[ input_0 + 240 ] = t0 +# asm 1: movdqu r0=reg128#7 +# asm 2: movddup 256(r0=%xmm6 +movddup 256(%rdi),%xmm6 + +# qhasm: r1 = mem64[ input_0 + 264 ] x2 +# asm 1: movddup 264(r1=reg128#8 +# asm 2: movddup 264(r1=%xmm7 +movddup 264(%rdi),%xmm7 + +# qhasm: r2 = mem64[ input_0 + 272 ] x2 +# asm 1: movddup 272(r2=reg128#9 +# asm 2: movddup 272(r2=%xmm8 +movddup 272(%rdi),%xmm8 + +# qhasm: r3 = mem64[ input_0 + 280 ] x2 +# asm 1: movddup 280(r3=reg128#10 +# asm 2: movddup 280(r3=%xmm9 +movddup 280(%rdi),%xmm9 + +# qhasm: r4 = mem64[ input_0 + 288 ] x2 +# asm 1: movddup 288(r4=reg128#11 +# asm 2: movddup 288(r4=%xmm10 +movddup 288(%rdi),%xmm10 + +# qhasm: r5 = mem64[ input_0 + 296 ] x2 +# asm 1: movddup 296(r5=reg128#12 +# asm 2: movddup 296(r5=%xmm11 +movddup 296(%rdi),%xmm11 + +# qhasm: r6 = mem64[ input_0 + 304 ] x2 +# asm 1: movddup 304(r6=reg128#13 +# asm 2: movddup 
304(r6=%xmm12 +movddup 304(%rdi),%xmm12 + +# qhasm: r7 = mem64[ input_0 + 312 ] x2 +# asm 1: movddup 312(r7=reg128#14 +# asm 2: movddup 312(r7=%xmm13 +movddup 312(%rdi),%xmm13 + +# qhasm: v00 = r0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = r4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = r4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r0=reg128#15 +# asm 2: vpor r0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r4 = v01 | v11 +# asm 1: vpor r4=reg128#7 +# asm 2: vpor r4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = r1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = r5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = r5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r1=reg128#11 +# asm 2: vpor r1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#8 +# asm 2: vpor r5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = r2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = r6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = r6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#9 +# asm 2: vpor r6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = r3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = r7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = r7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r3=reg128#13 +# asm 2: vpor r3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#10 +# asm 2: vpor r7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = r0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = r2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = r2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r0=reg128#14 +# asm 2: vpor r0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: r2 = v01 | v11 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = r1 & mask2 +# asm 
1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = r3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = r3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r1=reg128#15 +# asm 2: vpor r1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = r4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = r6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = r6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r4=reg128#13 +# asm 2: vpor r4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#7 +# asm 2: vpor r6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = r5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = r7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = r7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#8 +# asm 2: vpor r7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = r0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = r1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = r1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r0=reg128#10 +# asm 2: vpor r0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: r1 = v01 | v11 +# asm 1: vpor r1=reg128#14 +# asm 2: vpor r1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = r2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = r3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = r3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r2=reg128#15 +# asm 2: vpor r2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = r4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = r5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 
1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = r5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r4=reg128#12 +# asm 2: vpor r4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = r6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = r7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = r7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r6=reg128#13 +# asm 2: vpor r6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#7 +# asm 2: vpor r7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: t0 = r0[0]r1[0] +# asm 1: vpunpcklqdq t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm13,%xmm9,%xmm7 + +# qhasm: mem128[ input_0 + 256 ] = t0 +# asm 1: movdqu t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm10,%xmm14,%xmm7 + +# qhasm: mem128[ input_0 + 272 ] = t0 +# asm 1: movdqu t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm8,%xmm11,%xmm7 + +# qhasm: mem128[ input_0 + 288 ] = t0 +# asm 1: movdqu t0=reg128#7 +# asm 2: vpunpcklqdq t0=%xmm6 +vpunpcklqdq %xmm6,%xmm12,%xmm6 + +# qhasm: mem128[ input_0 + 304 ] = t0 +# asm 1: movdqu r0=reg128#7 +# asm 2: movddup 320(r0=%xmm6 +movddup 320(%rdi),%xmm6 + +# qhasm: r1 = mem64[ input_0 + 328 ] x2 +# asm 1: movddup 328(r1=reg128#8 +# asm 2: movddup 328(r1=%xmm7 +movddup 328(%rdi),%xmm7 + +# qhasm: r2 = mem64[ input_0 + 336 ] x2 +# asm 1: movddup 336(r2=reg128#9 +# asm 2: movddup 336(r2=%xmm8 +movddup 336(%rdi),%xmm8 + +# qhasm: r3 = mem64[ input_0 + 344 ] x2 +# asm 1: movddup 344(r3=reg128#10 +# asm 2: movddup 344(r3=%xmm9 +movddup 344(%rdi),%xmm9 + +# qhasm: r4 = mem64[ input_0 + 352 ] x2 +# asm 1: movddup 352(r4=reg128#11 +# asm 2: movddup 352(r4=%xmm10 +movddup 352(%rdi),%xmm10 + +# qhasm: r5 = mem64[ input_0 + 360 ] x2 +# asm 1: movddup 360(r5=reg128#12 +# asm 2: movddup 360(r5=%xmm11 +movddup 360(%rdi),%xmm11 + +# qhasm: r6 = mem64[ input_0 + 368 ] x2 +# asm 1: movddup 368(r6=reg128#13 +# asm 2: movddup 368(r6=%xmm12 +movddup 368(%rdi),%xmm12 + +# qhasm: r7 = mem64[ input_0 + 376 ] x2 +# asm 1: movddup 376(r7=reg128#14 +# asm 2: movddup 376(r7=%xmm13 +movddup 376(%rdi),%xmm13 + +# qhasm: v00 = r0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = r4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = r4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r0=reg128#15 +# asm 2: vpor r0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r4 = v01 | v11 +# asm 1: vpor r4=reg128#7 +# asm 2: vpor r4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = r1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = r5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 
+ +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = r5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r1=reg128#11 +# asm 2: vpor r1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#8 +# asm 2: vpor r5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = r2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = r6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = r6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#9 +# asm 2: vpor r6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = r3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = r7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = r7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r3=reg128#13 +# asm 2: vpor r3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#10 +# asm 2: vpor r7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = r0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = r2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = r2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r0=reg128#14 +# asm 2: vpor r0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: r2 = v01 | v11 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = r1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = r3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = r3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r1=reg128#15 +# asm 2: vpor r1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = r4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = r6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = r6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand 
%xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r4=reg128#13 +# asm 2: vpor r4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#7 +# asm 2: vpor r6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = r5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = r7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = r7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#8 +# asm 2: vpor r7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = r0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = r1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = r1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r0=reg128#10 +# asm 2: vpor r0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: r1 = v01 | v11 +# asm 1: vpor r1=reg128#14 +# asm 2: vpor r1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = r2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = r3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = r3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r2=reg128#15 +# asm 2: vpor r2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = r4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = r5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = r5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r4=reg128#12 +# asm 2: vpor r4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = r6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = r7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = r7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r6=reg128#13 +# asm 2: vpor r6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#7 +# asm 2: vpor r7=%xmm6 +vpor 
%xmm7,%xmm6,%xmm6 + +# qhasm: t0 = r0[0]r1[0] +# asm 1: vpunpcklqdq t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm13,%xmm9,%xmm7 + +# qhasm: mem128[ input_0 + 320 ] = t0 +# asm 1: movdqu t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm10,%xmm14,%xmm7 + +# qhasm: mem128[ input_0 + 336 ] = t0 +# asm 1: movdqu t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm8,%xmm11,%xmm7 + +# qhasm: mem128[ input_0 + 352 ] = t0 +# asm 1: movdqu t0=reg128#7 +# asm 2: vpunpcklqdq t0=%xmm6 +vpunpcklqdq %xmm6,%xmm12,%xmm6 + +# qhasm: mem128[ input_0 + 368 ] = t0 +# asm 1: movdqu r0=reg128#7 +# asm 2: movddup 384(r0=%xmm6 +movddup 384(%rdi),%xmm6 + +# qhasm: r1 = mem64[ input_0 + 392 ] x2 +# asm 1: movddup 392(r1=reg128#8 +# asm 2: movddup 392(r1=%xmm7 +movddup 392(%rdi),%xmm7 + +# qhasm: r2 = mem64[ input_0 + 400 ] x2 +# asm 1: movddup 400(r2=reg128#9 +# asm 2: movddup 400(r2=%xmm8 +movddup 400(%rdi),%xmm8 + +# qhasm: r3 = mem64[ input_0 + 408 ] x2 +# asm 1: movddup 408(r3=reg128#10 +# asm 2: movddup 408(r3=%xmm9 +movddup 408(%rdi),%xmm9 + +# qhasm: r4 = mem64[ input_0 + 416 ] x2 +# asm 1: movddup 416(r4=reg128#11 +# asm 2: movddup 416(r4=%xmm10 +movddup 416(%rdi),%xmm10 + +# qhasm: r5 = mem64[ input_0 + 424 ] x2 +# asm 1: movddup 424(r5=reg128#12 +# asm 2: movddup 424(r5=%xmm11 +movddup 424(%rdi),%xmm11 + +# qhasm: r6 = mem64[ input_0 + 432 ] x2 +# asm 1: movddup 432(r6=reg128#13 +# asm 2: movddup 432(r6=%xmm12 +movddup 432(%rdi),%xmm12 + +# qhasm: r7 = mem64[ input_0 + 440 ] x2 +# asm 1: movddup 440(r7=reg128#14 +# asm 2: movddup 440(r7=%xmm13 +movddup 440(%rdi),%xmm13 + +# qhasm: v00 = r0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = r4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = r4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r0=reg128#15 +# asm 2: vpor r0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r4 = v01 | v11 +# asm 1: vpor r4=reg128#7 +# asm 2: vpor r4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = r1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = r5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = r5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r1=reg128#11 +# asm 2: vpor r1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#8 +# asm 2: vpor r5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = r2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = r6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = r6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor 
r6=reg128#9 +# asm 2: vpor r6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = r3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = r7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = r7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r3=reg128#13 +# asm 2: vpor r3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#10 +# asm 2: vpor r7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = r0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = r2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = r2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r0=reg128#14 +# asm 2: vpor r0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: r2 = v01 | v11 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = r1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = r3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = r3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r1=reg128#15 +# asm 2: vpor r1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = r4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = r6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = r6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r4=reg128#13 +# asm 2: vpor r4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#7 +# asm 2: vpor r6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = r5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = r7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = r7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#8 +# asm 2: vpor r7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = r0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = r1 & mask4 +# asm 1: vpand 
v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = r1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r0=reg128#10 +# asm 2: vpor r0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: r1 = v01 | v11 +# asm 1: vpor r1=reg128#14 +# asm 2: vpor r1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = r2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = r3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = r3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r2=reg128#15 +# asm 2: vpor r2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = r4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = r5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = r5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r4=reg128#12 +# asm 2: vpor r4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = r6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = r7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = r7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r6=reg128#13 +# asm 2: vpor r6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#7 +# asm 2: vpor r7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: t0 = r0[0]r1[0] +# asm 1: vpunpcklqdq t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm13,%xmm9,%xmm7 + +# qhasm: mem128[ input_0 + 384 ] = t0 +# asm 1: movdqu t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm10,%xmm14,%xmm7 + +# qhasm: mem128[ input_0 + 400 ] = t0 +# asm 1: movdqu t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm8,%xmm11,%xmm7 + +# qhasm: mem128[ input_0 + 416 ] = t0 +# asm 1: movdqu t0=reg128#7 +# asm 2: vpunpcklqdq t0=%xmm6 +vpunpcklqdq %xmm6,%xmm12,%xmm6 + +# qhasm: mem128[ input_0 + 432 ] = t0 +# asm 1: movdqu r0=reg128#7 +# asm 2: movddup 448(r0=%xmm6 +movddup 448(%rdi),%xmm6 + +# qhasm: r1 = mem64[ input_0 + 456 ] x2 +# asm 1: movddup 456(r1=reg128#8 +# asm 2: movddup 456(r1=%xmm7 +movddup 456(%rdi),%xmm7 + +# qhasm: r2 = mem64[ input_0 + 464 ] x2 +# asm 1: movddup 464(r2=reg128#9 +# asm 2: movddup 464(r2=%xmm8 +movddup 464(%rdi),%xmm8 + +# qhasm: r3 = mem64[ input_0 + 472 ] x2 +# asm 1: movddup 472(r3=reg128#10 +# asm 2: movddup 472(r3=%xmm9 +movddup 
472(%rdi),%xmm9 + +# qhasm: r4 = mem64[ input_0 + 480 ] x2 +# asm 1: movddup 480(r4=reg128#11 +# asm 2: movddup 480(r4=%xmm10 +movddup 480(%rdi),%xmm10 + +# qhasm: r5 = mem64[ input_0 + 488 ] x2 +# asm 1: movddup 488(r5=reg128#12 +# asm 2: movddup 488(r5=%xmm11 +movddup 488(%rdi),%xmm11 + +# qhasm: r6 = mem64[ input_0 + 496 ] x2 +# asm 1: movddup 496(r6=reg128#13 +# asm 2: movddup 496(r6=%xmm12 +movddup 496(%rdi),%xmm12 + +# qhasm: r7 = mem64[ input_0 + 504 ] x2 +# asm 1: movddup 504(r7=reg128#14 +# asm 2: movddup 504(r7=%xmm13 +movddup 504(%rdi),%xmm13 + +# qhasm: v00 = r0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = r4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = r4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r0=reg128#15 +# asm 2: vpor r0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r4 = v01 | v11 +# asm 1: vpor r4=reg128#7 +# asm 2: vpor r4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = r1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = r5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = r5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r1=reg128#11 +# asm 2: vpor r1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#8 +# asm 2: vpor r5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = r2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = r6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = r6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#9 +# asm 2: vpor r6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = r3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = r7 & mask0 +# asm 1: vpand v10=reg128#1 +# asm 2: vpand v10=%xmm0 +vpand %xmm0,%xmm13,%xmm0 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = r7 & mask1 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm1,%xmm13,%xmm1 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r3=reg128#1 +# asm 2: vpor r3=%xmm0 +vpor %xmm0,%xmm12,%xmm0 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#2 +# asm 2: vpor r7=%xmm1 +vpor %xmm1,%xmm9,%xmm1 + +# qhasm: v00 = r0 & mask2 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm2,%xmm14,%xmm9 + +# qhasm: v10 = r2 & mask2 +# asm 1: vpand v10=reg128#13 +# asm 2: vpand v10=%xmm12 +vpand %xmm2,%xmm11,%xmm12 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm3,%xmm14,%xmm13 + +# 
qhasm: v11 = r2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r0=reg128#10 +# asm 2: vpor r0=%xmm9 +vpor %xmm12,%xmm9,%xmm9 + +# qhasm: r2 = v01 | v11 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm11,%xmm13,%xmm11 + +# qhasm: v00 = r1 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm10,%xmm12 + +# qhasm: v10 = r3 & mask2 +# asm 1: vpand v10=reg128#14 +# asm 2: vpand v10=%xmm13 +vpand %xmm2,%xmm0,%xmm13 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = r3 & mask3 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm3,%xmm0,%xmm0 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r1=reg128#13 +# asm 2: vpor r1=%xmm12 +vpor %xmm13,%xmm12,%xmm12 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#1 +# asm 2: vpor r3=%xmm0 +vpor %xmm0,%xmm10,%xmm0 + +# qhasm: v00 = r4 & mask2 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm2,%xmm6,%xmm10 + +# qhasm: v10 = r6 & mask2 +# asm 1: vpand v10=reg128#14 +# asm 2: vpand v10=%xmm13 +vpand %xmm2,%xmm8,%xmm13 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = r6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r4=reg128#11 +# asm 2: vpor r4=%xmm10 +vpor %xmm13,%xmm10,%xmm10 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#7 +# asm 2: vpor r6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = r5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = r7 & mask2 +# asm 1: vpand v10=reg128#3 +# asm 2: vpand v10=%xmm2 +vpand %xmm2,%xmm1,%xmm2 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = r7 & mask3 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm3,%xmm1,%xmm1 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r5=reg128#3 +# asm 2: vpor r5=%xmm2 +vpor %xmm2,%xmm8,%xmm2 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#2 +# asm 2: vpor r7=%xmm1 +vpor %xmm1,%xmm7,%xmm1 + +# qhasm: v00 = r0 & mask4 +# asm 1: vpand v00=reg128#4 +# asm 2: vpand v00=%xmm3 +vpand %xmm4,%xmm9,%xmm3 + +# qhasm: v10 = r1 & mask4 +# asm 1: vpand v10=reg128#8 +# asm 2: vpand v10=%xmm7 +vpand %xmm4,%xmm12,%xmm7 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm5,%xmm9,%xmm8 + +# qhasm: v11 = r1 & mask5 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm5,%xmm12,%xmm9 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r0=reg128#4 +# asm 2: vpor r0=%xmm3 +vpor %xmm7,%xmm3,%xmm3 + +# qhasm: r1 = v01 | v11 +# asm 1: vpor r1=reg128#8 +# asm 2: vpor r1=%xmm7 +vpor %xmm9,%xmm8,%xmm7 + +# qhasm: v00 = r2 & mask4 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm4,%xmm11,%xmm8 + +# qhasm: v10 = r3 & mask4 +# asm 1: vpand v10=reg128#10 +# asm 2: vpand v10=%xmm9 +vpand %xmm4,%xmm0,%xmm9 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = r3 & mask5 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm5,%xmm0,%xmm0 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r2=reg128#9 +# asm 2: vpor r2=%xmm8 +vpor %xmm9,%xmm8,%xmm8 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor 
r3=reg128#1 +# asm 2: vpor r3=%xmm0 +vpor %xmm0,%xmm11,%xmm0 + +# qhasm: v00 = r4 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm10,%xmm9 + +# qhasm: v10 = r5 & mask4 +# asm 1: vpand v10=reg128#12 +# asm 2: vpand v10=%xmm11 +vpand %xmm4,%xmm2,%xmm11 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: v11 = r5 & mask5 +# asm 1: vpand v11=reg128#3 +# asm 2: vpand v11=%xmm2 +vpand %xmm5,%xmm2,%xmm2 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r4=reg128#10 +# asm 2: vpor r4=%xmm9 +vpor %xmm11,%xmm9,%xmm9 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#3 +# asm 2: vpor r5=%xmm2 +vpor %xmm2,%xmm10,%xmm2 + +# qhasm: v00 = r6 & mask4 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm4,%xmm6,%xmm10 + +# qhasm: v10 = r7 & mask4 +# asm 1: vpand v10=reg128#5 +# asm 2: vpand v10=%xmm4 +vpand %xmm4,%xmm1,%xmm4 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = r7 & mask5 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm5,%xmm1,%xmm1 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r6=reg128#5 +# asm 2: vpor r6=%xmm4 +vpor %xmm4,%xmm10,%xmm4 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#2 +# asm 2: vpor r7=%xmm1 +vpor %xmm1,%xmm6,%xmm1 + +# qhasm: t0 = r0[0]r1[0] +# asm 1: vpunpcklqdq t0=reg128#4 +# asm 2: vpunpcklqdq t0=%xmm3 +vpunpcklqdq %xmm7,%xmm3,%xmm3 + +# qhasm: mem128[ input_0 + 448 ] = t0 +# asm 1: movdqu t0=reg128#1 +# asm 2: vpunpcklqdq t0=%xmm0 +vpunpcklqdq %xmm0,%xmm8,%xmm0 + +# qhasm: mem128[ input_0 + 464 ] = t0 +# asm 1: movdqu t0=reg128#1 +# asm 2: vpunpcklqdq t0=%xmm0 +vpunpcklqdq %xmm2,%xmm9,%xmm0 + +# qhasm: mem128[ input_0 + 480 ] = t0 +# asm 1: movdqu t0=reg128#1 +# asm 2: vpunpcklqdq t0=%xmm0 +vpunpcklqdq %xmm1,%xmm4,%xmm0 + +# qhasm: mem128[ input_0 + 496 ] = t0 +# asm 1: movdqu +#include + +void PQCLEAN_MCELIECE348864_AVX_uint32_sort(uint32_t *x, size_t n); + +#endif diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_avx/update_asm.S b/src/kem/classic_mceliece/pqclean_mceliece348864_avx/update_asm.S new file mode 100644 index 0000000000..cecfdbcb95 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_avx/update_asm.S @@ -0,0 +1,354 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: int64 s0 + +# qhasm: int64 s1 + +# qhasm: enter update_asm +.p2align 5 +.global _PQCLEAN_MCELIECE348864_AVX_update_asm +.global PQCLEAN_MCELIECE348864_AVX_update_asm +_PQCLEAN_MCELIECE348864_AVX_update_asm: +PQCLEAN_MCELIECE348864_AVX_update_asm: +mov %rsp,%r11 +and $31,%r11 +add $0,%r11 +sub %r11,%rsp + +# qhasm: s1 = input_1 +# asm 1: mov s1=int64#2 +# asm 2: mov s1=%rsi +mov %rsi,%rsi + +# qhasm: s0 = mem64[ input_0 + 0 ] +# asm 1: movq 0(s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# 
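Reviewer's note, not part of the patch: the long vpand/psllq/psrlq/vpor sequences above are delta swaps. Each block masks out a group of bits, shifts it by 4, 2 or 1 positions, and ORs the pieces back so that bit groups are exchanged between a pair of registers. A minimal scalar C sketch of the same idea follows, transposing an 8x8 bit matrix stored one row per byte; the masks, shifts and layout in the sketch are assumptions chosen for this scalar example, not values copied from the assembly above.

#include <assert.h>
#include <stdint.h>

/* delta_swap exchanges, for every bit set in m, the bit at position p
   with the bit at position p + s -- the scalar analogue of the
   vpand / shift / vpor blocks in the assembly above. */
static uint64_t delta_swap(uint64_t x, uint64_t m, unsigned s) {
    uint64_t t = ((x >> s) ^ x) & m;
    return x ^ t ^ (t << s);
}

/* Transpose an 8x8 bit matrix stored one row per byte
   (row r = byte r, column c = bit c, least significant bit first). */
static uint64_t transpose8(uint64_t x) {
    x = delta_swap(x, 0x00AA00AA00AA00AAULL, 7);   /* selected bits move by 7  */
    x = delta_swap(x, 0x0000CCCC0000CCCCULL, 14);  /* selected bits move by 14 */
    x = delta_swap(x, 0x00000000F0F0F0F0ULL, 28);  /* selected bits move by 28 */
    return x;
}

int main(void) {
    uint64_t m = 1ULL << (8 * 2 + 5);              /* single bit at row 2, column 5 */
    assert(transpose8(m) == 1ULL << (8 * 5 + 2));  /* lands at row 5, column 2 */
    /* transposing twice is the identity */
    assert(transpose8(transpose8(0x123456789ABCDEF0ULL)) == 0x123456789ABCDEF0ULL);
    return 0;
}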
asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,> (j * 8)) & 0xFF; + } +} + +void PQCLEAN_MCELIECE348864_AVX_store2(unsigned char *dest, gf a) { + dest[0] = a & 0xFF; + dest[1] = a >> 8; +} + +uint16_t PQCLEAN_MCELIECE348864_AVX_load2(const unsigned char *src) { + uint16_t a; + + a = src[1]; + a <<= 8; + a |= src[0]; + + return a & GFMASK; +} + +uint32_t PQCLEAN_MCELIECE348864_AVX_load4(const unsigned char *src) { + uint32_t a; + + a = src[3]; + a <<= 8; + a |= src[2]; + a <<= 8; + a |= src[1]; + a <<= 8; + a |= src[0]; + + return a; +} + +void PQCLEAN_MCELIECE348864_AVX_irr_load(uint64_t *out, const unsigned char *in) { + int i, j; + uint16_t irr[ SYS_T + 1 ]; + + for (i = 0; i < SYS_T; i++) { + irr[i] = PQCLEAN_MCELIECE348864_AVX_load2(in + i * 2); + irr[i] &= GFMASK; + } + + irr[ SYS_T ] = 1; + + for (i = 0; i < GFBITS; i++) { + out[i] = 0; + } + + for (i = SYS_T; i >= 0; i--) { + for (j = 0; j < GFBITS; j++) { + out[j] <<= 1; + out[j] |= (irr[i] >> j) & 1; + } + } +} + +void PQCLEAN_MCELIECE348864_AVX_store8(unsigned char *out, uint64_t in) { + out[0] = (in >> 0x00) & 0xFF; + out[1] = (in >> 0x08) & 0xFF; + out[2] = (in >> 0x10) & 0xFF; + out[3] = (in >> 0x18) & 0xFF; + out[4] = (in >> 0x20) & 0xFF; + out[5] = (in >> 0x28) & 0xFF; + out[6] = (in >> 0x30) & 0xFF; + out[7] = (in >> 0x38) & 0xFF; +} + +uint64_t PQCLEAN_MCELIECE348864_AVX_load8(const unsigned char *in) { + int i; + uint64_t ret = in[7]; + + for (i = 6; i >= 0; i--) { + ret <<= 8; + ret |= in[i]; + } + + return ret; +} + +gf PQCLEAN_MCELIECE348864_AVX_bitrev(gf a) { + a = ((a & 0x00FF) << 8) | ((a & 0xFF00) >> 8); + a = ((a & 0x0F0F) << 4) | ((a & 0xF0F0) >> 4); + a = ((a & 0x3333) << 2) | ((a & 0xCCCC) >> 2); + a = ((a & 0x5555) << 1) | ((a & 0xAAAA) >> 1); + + return a >> 4; +} + +vec128 PQCLEAN_MCELIECE348864_AVX_load16(const unsigned char *in) { + return PQCLEAN_MCELIECE348864_AVX_vec128_set2x( PQCLEAN_MCELIECE348864_AVX_load8(in), PQCLEAN_MCELIECE348864_AVX_load8(in + 8) ); +} + +void PQCLEAN_MCELIECE348864_AVX_store16(unsigned char *out, vec128 in) { + PQCLEAN_MCELIECE348864_AVX_store8(out + 0, PQCLEAN_MCELIECE348864_AVX_vec128_extract(in, 0)); + PQCLEAN_MCELIECE348864_AVX_store8(out + 8, PQCLEAN_MCELIECE348864_AVX_vec128_extract(in, 1)); +} diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_avx/util.h b/src/kem/classic_mceliece/pqclean_mceliece348864_avx/util.h new file mode 100644 index 0000000000..1367edf4fe --- 
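For a reviewer checking the byte order used by the util.c helpers copied above: store8/load8 write and read 64-bit words least significant byte first. The following standalone sketch reimplements just those two helpers under hypothetical local names (so it compiles without the rest of the vendored tree) and checks the round trip.

#include <assert.h>
#include <stdint.h>

/* Little-endian 64-bit store, mirroring PQCLEAN_MCELIECE348864_AVX_store8 above. */
static void store8_le(unsigned char *out, uint64_t in) {
    for (int i = 0; i < 8; i++) {
        out[i] = (unsigned char)(in >> (8 * i));
    }
}

/* Little-endian 64-bit load, mirroring PQCLEAN_MCELIECE348864_AVX_load8 above. */
static uint64_t load8_le(const unsigned char *in) {
    uint64_t ret = in[7];
    for (int i = 6; i >= 0; i--) {
        ret <<= 8;
        ret |= in[i];
    }
    return ret;
}

int main(void) {
    unsigned char buf[8];
    uint64_t x = 0x0123456789ABCDEFULL;

    store8_le(buf, x);
    assert(buf[0] == 0xEF && buf[7] == 0x01);  /* least significant byte comes first */
    assert(load8_le(buf) == x);                /* store followed by load is the identity */
    return 0;
}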
/dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_avx/util.h @@ -0,0 +1,33 @@ +#ifndef PQCLEAN_MCELIECE348864_AVX_UTIL_H +#define PQCLEAN_MCELIECE348864_AVX_UTIL_H +/* + This file is for loading/storing data in a little-endian fashion +*/ + + +#include "gf.h" +#include "vec128.h" + +#include + +void PQCLEAN_MCELIECE348864_AVX_store_i(unsigned char *out, uint64_t in, int i); +void PQCLEAN_MCELIECE348864_AVX_store2(unsigned char *dest, gf a); + +uint16_t PQCLEAN_MCELIECE348864_AVX_load2(const unsigned char *src); + +uint32_t PQCLEAN_MCELIECE348864_AVX_load4(const unsigned char *src); + +void PQCLEAN_MCELIECE348864_AVX_irr_load(uint64_t *out, const unsigned char *in); + +void PQCLEAN_MCELIECE348864_AVX_store8(unsigned char *out, uint64_t in); + +uint64_t PQCLEAN_MCELIECE348864_AVX_load8(const unsigned char *in); + +gf PQCLEAN_MCELIECE348864_AVX_bitrev(gf a); + +vec128 PQCLEAN_MCELIECE348864_AVX_load16(const unsigned char *in); + +void PQCLEAN_MCELIECE348864_AVX_store16(unsigned char *out, vec128 in); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_avx/vec.c b/src/kem/classic_mceliece/pqclean_mceliece348864_avx/vec.c new file mode 100644 index 0000000000..6836a151db --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_avx/vec.c @@ -0,0 +1,25 @@ +#include "vec.h" + +#include "params.h" + +extern void PQCLEAN_MCELIECE348864_AVX_vec_mul_asm(uint64_t *, const uint64_t *, const uint64_t *); +extern void PQCLEAN_MCELIECE348864_AVX_vec_mul_sp_asm(uint64_t *, const uint64_t *, const uint64_t *); + + +void PQCLEAN_MCELIECE348864_AVX_vec_mul(uint64_t *h, const uint64_t *f, const uint64_t *g) { + PQCLEAN_MCELIECE348864_AVX_vec_mul_asm(h, f, g); +} + + +void PQCLEAN_MCELIECE348864_AVX_vec_mul_sp(uint64_t *h, const uint64_t *f, const uint64_t *g) { + PQCLEAN_MCELIECE348864_AVX_vec_mul_sp_asm(h, f, g); +} + +void PQCLEAN_MCELIECE348864_AVX_vec_add(uint64_t *h, const uint64_t *f, const uint64_t *g) { + int b; + + for (b = 0; b < GFBITS; b++) { + h[b] = f[b] ^ g[b]; + } +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_avx/vec.h b/src/kem/classic_mceliece/pqclean_mceliece348864_avx/vec.h new file mode 100644 index 0000000000..cbe6beb6bc --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_avx/vec.h @@ -0,0 +1,13 @@ +#ifndef PQCLEAN_MCELIECE348864_AVX_VEC_H +#define PQCLEAN_MCELIECE348864_AVX_VEC_H + +#include + + +void PQCLEAN_MCELIECE348864_AVX_vec_mul(uint64_t *h, const uint64_t *f, const uint64_t *g); + +void PQCLEAN_MCELIECE348864_AVX_vec_mul_sp(uint64_t *h, const uint64_t *f, const uint64_t *g); + +void PQCLEAN_MCELIECE348864_AVX_vec_add(uint64_t *h, const uint64_t *f, const uint64_t *g); + +#endif diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_avx/vec128.c b/src/kem/classic_mceliece/pqclean_mceliece348864_avx/vec128.c new file mode 100644 index 0000000000..e40f289775 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_avx/vec128.c @@ -0,0 +1,83 @@ +#include "vec128.h" + +vec128 PQCLEAN_MCELIECE348864_AVX_vec128_set1_16b(uint16_t a) { + return _mm_set1_epi16(a); +} + +int PQCLEAN_MCELIECE348864_AVX_vec128_testz(vec128 a) { + return _mm_testz_si128(a, a); +} + +vec128 PQCLEAN_MCELIECE348864_AVX_vec128_setzero(void) { + return _mm_setzero_si128(); +} + +vec128 PQCLEAN_MCELIECE348864_AVX_vec128_and(vec128 a, vec128 b) { + return _mm_and_si128(a, b); +} + +vec128 PQCLEAN_MCELIECE348864_AVX_vec128_xor(vec128 a, vec128 b) { + return _mm_xor_si128(a, b); +} + +vec128 
PQCLEAN_MCELIECE348864_AVX_vec128_or(vec128 a, vec128 b) { + return _mm_or_si128(a, b); +} + +vec128 PQCLEAN_MCELIECE348864_AVX_vec128_sll_2x(vec128 a, int s) { + return _mm_slli_epi64(a, s); +} + +vec128 PQCLEAN_MCELIECE348864_AVX_vec128_srl_2x(vec128 a, int s) { + return _mm_srli_epi64(a, s); +} + +vec128 PQCLEAN_MCELIECE348864_AVX_vec128_set2x(uint64_t a0, uint64_t a1) { + return _mm_set_epi64x(a1, a0); +} + +vec128 PQCLEAN_MCELIECE348864_AVX_vec128_unpack_low(vec128 a, vec128 b) { + return _mm_unpacklo_epi64(a, b); +} + +vec128 PQCLEAN_MCELIECE348864_AVX_vec128_unpack_high(vec128 a, vec128 b) { + return _mm_unpackhi_epi64(a, b); +} + +vec128 PQCLEAN_MCELIECE348864_AVX_vec128_setbits(uint64_t a) { + return _mm_set1_epi64x(-a); +} + +void PQCLEAN_MCELIECE348864_AVX_vec128_copy(vec128 *dest, const vec128 *src) { + int i; + + for (i = 0; i < GFBITS; i++) { + dest[i] = src[i]; + } +} + +void PQCLEAN_MCELIECE348864_AVX_vec128_add(vec128 *c, const vec128 *a, const vec128 *b) { + int i; + + for (i = 0; i < GFBITS; i++) { + c[i] = PQCLEAN_MCELIECE348864_AVX_vec128_xor(a[i], b[i]); + } +} + +vec128 PQCLEAN_MCELIECE348864_AVX_vec128_or_reduce(const vec128 *a) { + int i; + vec128 ret; + + ret = a[0]; + for (i = 1; i < GFBITS; i++) { + ret = PQCLEAN_MCELIECE348864_AVX_vec128_or(ret, a[i]); + } + + return ret; +} + +/* bitsliced field multiplications */ +void PQCLEAN_MCELIECE348864_AVX_vec128_mul(vec128 *h, vec128 *f, const vec128 *g) { + PQCLEAN_MCELIECE348864_AVX_vec128_mul_asm(h, f, g, 16); +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_avx/vec128.h b/src/kem/classic_mceliece/pqclean_mceliece348864_avx/vec128.h new file mode 100644 index 0000000000..69c9be1c3a --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_avx/vec128.h @@ -0,0 +1,41 @@ +#ifndef PQCLEAN_MCELIECE348864_AVX_VEC128_H +#define PQCLEAN_MCELIECE348864_AVX_VEC128_H +/* + This file is for functions related to 128-bit vectors + including functions for bitsliced field operations +*/ + + +#include "params.h" + +#include +#include + +typedef __m128i vec128; + +// this needs to be a macro, because +// _mm_extract_epi64 requires a literal int argument. 
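Reviewer's note, not part of the patch: the vec128 wrappers copied above, together with the extract macro defined immediately below, are thin aliases for SSE2/SSE4.1 intrinsics. The standalone sketch below bypasses the wrappers and calls the intrinsics directly (compile with -msse4.1) to show the lane ordering that vec128_set2x relies on when it passes its arguments to _mm_set_epi64x in reverse.

#include <assert.h>
#include <stdint.h>
#include <smmintrin.h>  /* SSE4.1: _mm_extract_epi64, _mm_testz_si128 */

typedef __m128i vec128;

int main(void) {
    /* _mm_set_epi64x takes the high lane first, which is why the
       vendored vec128_set2x(a0, a1) above passes (a1, a0). */
    vec128 v = _mm_set_epi64x(/* a1 */ 0x2222222222222222LL,
                              /* a0 */ 0x1111111111111111LL);

    assert((uint64_t)_mm_extract_epi64(v, 0) == 0x1111111111111111ULL);  /* lane 0 = a0 */
    assert((uint64_t)_mm_extract_epi64(v, 1) == 0x2222222222222222ULL);  /* lane 1 = a1 */

    /* vec128_xor is plain _mm_xor_si128; XORing a value with itself gives
       zero, which _mm_testz_si128 (the basis of vec128_testz) reports as 1. */
    vec128 z = _mm_xor_si128(v, v);
    assert(_mm_testz_si128(z, z) == 1);
    return 0;
}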
+#define PQCLEAN_MCELIECE348864_AVX_vec128_extract(a, i) ((uint64_t) _mm_extract_epi64((vec128) (a), (i))) + +int PQCLEAN_MCELIECE348864_AVX_vec128_testz(vec128 a); +vec128 PQCLEAN_MCELIECE348864_AVX_vec128_set1_16b(uint16_t a); +vec128 PQCLEAN_MCELIECE348864_AVX_vec128_setzero(void); +vec128 PQCLEAN_MCELIECE348864_AVX_vec128_and(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE348864_AVX_vec128_xor(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE348864_AVX_vec128_or(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE348864_AVX_vec128_sll_2x(vec128 a, int s); +vec128 PQCLEAN_MCELIECE348864_AVX_vec128_srl_2x(vec128 a, int s); +vec128 PQCLEAN_MCELIECE348864_AVX_vec128_set2x(uint64_t a0, uint64_t a1); +vec128 PQCLEAN_MCELIECE348864_AVX_vec128_unpack_low(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE348864_AVX_vec128_unpack_high(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE348864_AVX_vec128_setbits(uint64_t a); +void PQCLEAN_MCELIECE348864_AVX_vec128_copy(vec128 *dest, const vec128 *src); +void PQCLEAN_MCELIECE348864_AVX_vec128_add(vec128 *c, const vec128 *a, const vec128 *b); +vec128 PQCLEAN_MCELIECE348864_AVX_vec128_or_reduce(const vec128 *a); + +extern void PQCLEAN_MCELIECE348864_AVX_vec128_mul_asm(vec128 *, vec128 *, const vec128 *, int); + +/* bitsliced field multiplications */ +void PQCLEAN_MCELIECE348864_AVX_vec128_mul(vec128 *h, vec128 *f, const vec128 *g); + +#endif diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_avx/vec128_mul_asm.S b/src/kem/classic_mceliece/pqclean_mceliece348864_avx/vec128_mul_asm.S new file mode 100644 index 0000000000..de29ef1cc3 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_avx/vec128_mul_asm.S @@ -0,0 +1,1369 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: reg256 b0 + +# qhasm: reg256 b1 + +# qhasm: reg256 b2 + +# qhasm: reg256 b3 + +# qhasm: reg256 b4 + +# qhasm: reg256 b5 + +# qhasm: reg256 b6 + +# qhasm: reg256 b7 + +# qhasm: reg256 b8 + +# qhasm: reg256 b9 + +# qhasm: reg256 b10 + +# qhasm: reg256 b11 + +# qhasm: reg256 a0 + +# qhasm: reg256 a1 + +# qhasm: reg256 a2 + +# qhasm: reg256 a3 + +# qhasm: reg256 a4 + +# qhasm: reg256 a5 + +# qhasm: reg256 r0 + +# qhasm: reg256 r1 + +# qhasm: reg256 r2 + +# qhasm: reg256 r3 + +# qhasm: reg256 r4 + +# qhasm: reg256 r5 + +# qhasm: reg256 r6 + +# qhasm: reg256 r7 + +# qhasm: reg256 r8 + +# qhasm: reg256 r9 + +# qhasm: reg256 r10 + +# qhasm: reg256 r11 + +# qhasm: reg256 r12 + +# qhasm: reg256 r13 + +# qhasm: reg256 r14 + +# qhasm: reg256 r15 + +# qhasm: reg256 r16 + +# qhasm: reg256 r17 + +# qhasm: reg256 r18 + +# qhasm: reg256 r19 + +# qhasm: reg256 r20 + +# qhasm: reg256 r21 + +# qhasm: reg256 r22 + +# qhasm: reg256 r + +# qhasm: reg128 h0 + +# qhasm: reg128 h1 + +# qhasm: reg128 h2 + +# qhasm: reg128 h3 + +# qhasm: reg128 h4 + +# qhasm: reg128 h5 + +# qhasm: reg128 h6 + +# qhasm: reg128 h7 + +# qhasm: reg128 h8 + +# qhasm: reg128 h9 + +# qhasm: reg128 h10 + +# qhasm: reg128 h11 + +# qhasm: reg128 h12 + +# qhasm: reg128 h13 + +# qhasm: reg128 h14 + +# qhasm: reg128 h15 + +# qhasm: reg128 h16 + +# qhasm: reg128 h17 + +# qhasm: reg128 h18 + +# qhasm: reg128 h19 + +# qhasm: reg128 h20 + +# qhasm: reg128 h21 + +# qhasm: 
reg128 h22 + +# qhasm: stack4864 buf + +# qhasm: int64 ptr + +# qhasm: int64 tmp + +# qhasm: enter vec128_mul_asm +.p2align 5 +.global _PQCLEAN_MCELIECE348864_AVX_vec128_mul_asm +.global PQCLEAN_MCELIECE348864_AVX_vec128_mul_asm +_PQCLEAN_MCELIECE348864_AVX_vec128_mul_asm: +PQCLEAN_MCELIECE348864_AVX_vec128_mul_asm: +mov %rsp,%r11 +and $31,%r11 +add $608,%r11 +sub %r11,%rsp + +# qhasm: ptr = &buf +# asm 1: leaq ptr=int64#4 +# asm 2: leaq ptr=%rcx +leaq 0(%rsp),%rcx + +# qhasm: b11 = mem128[ input_2 + 176 ] x2 +# asm 1: vbroadcasti128 176(b11=reg256#1 +# asm 2: vbroadcasti128 176(b11=%ymm0 +vbroadcasti128 176(%rdx), %ymm0 + +# qhasm: a5[0] = mem128[ input_1 + 80 ] +# asm 1: vinsertf128 $0x0,80(r16=reg256#3 +# asm 2: vpand r16=%ymm2 +vpand %ymm0,%ymm1,%ymm2 + +# qhasm: mem256[ ptr + 512 ] = r16 +# asm 1: vmovupd r15=reg256#4 +# asm 2: vpand r15=%ymm3 +vpand %ymm0,%ymm2,%ymm3 + +# qhasm: a3[0] = mem128[ input_1 + 48 ] +# asm 1: vinsertf128 $0x0,48(r14=reg256#6 +# asm 2: vpand r14=%ymm5 +vpand %ymm0,%ymm4,%ymm5 + +# qhasm: a2[0] = mem128[ input_1 + 32 ] +# asm 1: vinsertf128 $0x0,32(r13=reg256#8 +# asm 2: vpand r13=%ymm7 +vpand %ymm0,%ymm6,%ymm7 + +# qhasm: a1[0] = mem128[ input_1 + 16 ] +# asm 1: vinsertf128 $0x0,16(r12=reg256#10 +# asm 2: vpand r12=%ymm9 +vpand %ymm0,%ymm8,%ymm9 + +# qhasm: a0[0] = mem128[ input_1 + 0 ] +# asm 1: vinsertf128 $0x0,0(r11=reg256#1 +# asm 2: vpand r11=%ymm0 +vpand %ymm0,%ymm10,%ymm0 + +# qhasm: b10 = mem128[ input_2 + 160 ] x2 +# asm 1: vbroadcasti128 160(b10=reg256#12 +# asm 2: vbroadcasti128 160(b10=%ymm11 +vbroadcasti128 160(%rdx), %ymm11 + +# qhasm: r = b10 & a5 +# asm 1: vpand r=reg256#13 +# asm 2: vpand r=%ymm12 +vpand %ymm11,%ymm1,%ymm12 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm11,%ymm2,%ymm3 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm11,%ymm4,%ymm3 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm11,%ymm6,%ymm3 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm11,%ymm8,%ymm3 + +# qhasm: r11 ^= r +# asm 1: vpxor r10=reg256#4 +# asm 2: vpand r10=%ymm3 +vpand %ymm11,%ymm10,%ymm3 + +# qhasm: b9 = mem128[ input_2 + 144 ] x2 +# asm 1: vbroadcasti128 144(b9=reg256#12 +# asm 2: vbroadcasti128 144(b9=%ymm11 +vbroadcasti128 144(%rdx), %ymm11 + +# qhasm: r = b9 & a5 +# asm 1: vpand r=reg256#13 +# asm 2: vpand r=%ymm12 +vpand %ymm11,%ymm1,%ymm12 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm11,%ymm2,%ymm5 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm11,%ymm4,%ymm5 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm11,%ymm6,%ymm5 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm11,%ymm8,%ymm5 + +# qhasm: r10 ^= r +# asm 1: vpxor r9=reg256#6 +# asm 2: vpand r9=%ymm5 +vpand %ymm11,%ymm10,%ymm5 + +# qhasm: b8 = mem128[ input_2 + 128 ] x2 +# asm 1: vbroadcasti128 128(b8=reg256#12 +# asm 2: vbroadcasti128 128(b8=%ymm11 +vbroadcasti128 128(%rdx), %ymm11 + +# qhasm: r = b8 & a5 +# asm 1: vpand r=reg256#13 +# asm 2: vpand r=%ymm12 +vpand %ymm11,%ymm1,%ymm12 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm11,%ymm2,%ymm7 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm11,%ymm4,%ymm7 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm11,%ymm6,%ymm7 + +# qhasm: r10 ^= r +# asm 1: vpxor 
r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm11,%ymm8,%ymm7 + +# qhasm: r9 ^= r +# asm 1: vpxor r8=reg256#8 +# asm 2: vpand r8=%ymm7 +vpand %ymm11,%ymm10,%ymm7 + +# qhasm: b7 = mem128[ input_2 + 112 ] x2 +# asm 1: vbroadcasti128 112(b7=reg256#12 +# asm 2: vbroadcasti128 112(b7=%ymm11 +vbroadcasti128 112(%rdx), %ymm11 + +# qhasm: r = b7 & a5 +# asm 1: vpand r=reg256#13 +# asm 2: vpand r=%ymm12 +vpand %ymm11,%ymm1,%ymm12 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm11,%ymm2,%ymm9 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm11,%ymm4,%ymm9 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm11,%ymm6,%ymm9 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm11,%ymm8,%ymm9 + +# qhasm: r8 ^= r +# asm 1: vpxor r7=reg256#10 +# asm 2: vpand r7=%ymm9 +vpand %ymm11,%ymm10,%ymm9 + +# qhasm: b6 = mem128[ input_2 + 96 ] x2 +# asm 1: vbroadcasti128 96(b6=reg256#12 +# asm 2: vbroadcasti128 96(b6=%ymm11 +vbroadcasti128 96(%rdx), %ymm11 + +# qhasm: r = b6 & a5 +# asm 1: vpand r=reg256#13 +# asm 2: vpand r=%ymm12 +vpand %ymm11,%ymm1,%ymm12 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm11,%ymm2,%ymm0 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm11,%ymm4,%ymm0 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm11,%ymm6,%ymm0 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm11,%ymm8,%ymm0 + +# qhasm: r7 ^= r +# asm 1: vpxor r6=reg256#1 +# asm 2: vpand r6=%ymm0 +vpand %ymm11,%ymm10,%ymm0 + +# qhasm: b5 = mem128[ input_2 + 80 ] x2 +# asm 1: vbroadcasti128 80(b5=reg256#12 +# asm 2: vbroadcasti128 80(b5=%ymm11 +vbroadcasti128 80(%rdx), %ymm11 + +# qhasm: r = b5 & a5 +# asm 1: vpand r=reg256#13 +# asm 2: vpand r=%ymm12 +vpand %ymm11,%ymm1,%ymm12 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm11,%ymm2,%ymm3 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm11,%ymm4,%ymm3 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm11,%ymm6,%ymm3 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm11,%ymm8,%ymm3 + +# qhasm: r6 ^= r +# asm 1: vpxor r5=reg256#4 +# asm 2: vpand r5=%ymm3 +vpand %ymm11,%ymm10,%ymm3 + +# qhasm: b4 = mem128[ input_2 + 64 ] x2 +# asm 1: vbroadcasti128 64(b4=reg256#12 +# asm 2: vbroadcasti128 64(b4=%ymm11 +vbroadcasti128 64(%rdx), %ymm11 + +# qhasm: r = b4 & a5 +# asm 1: vpand r=reg256#13 +# asm 2: vpand r=%ymm12 +vpand %ymm11,%ymm1,%ymm12 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm11,%ymm2,%ymm5 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm11,%ymm4,%ymm5 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm11,%ymm6,%ymm5 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm11,%ymm8,%ymm5 + +# qhasm: r5 ^= r +# asm 1: vpxor r4=reg256#6 +# asm 2: vpand r4=%ymm5 +vpand %ymm11,%ymm10,%ymm5 + +# qhasm: b3 = mem128[ input_2 + 48 ] x2 +# asm 1: vbroadcasti128 48(b3=reg256#12 +# asm 2: vbroadcasti128 48(b3=%ymm11 +vbroadcasti128 48(%rdx), %ymm11 + +# qhasm: r = b3 & a5 +# asm 1: vpand r=reg256#13 +# asm 2: vpand r=%ymm12 +vpand %ymm11,%ymm1,%ymm12 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm11,%ymm2,%ymm7 + +# qhasm: r7 ^= r +# asm 1: 
vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm11,%ymm4,%ymm7 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm11,%ymm6,%ymm7 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm11,%ymm8,%ymm7 + +# qhasm: r4 ^= r +# asm 1: vpxor r3=reg256#8 +# asm 2: vpand r3=%ymm7 +vpand %ymm11,%ymm10,%ymm7 + +# qhasm: b2 = mem128[ input_2 + 32 ] x2 +# asm 1: vbroadcasti128 32(b2=reg256#12 +# asm 2: vbroadcasti128 32(b2=%ymm11 +vbroadcasti128 32(%rdx), %ymm11 + +# qhasm: r = b2 & a5 +# asm 1: vpand r=reg256#13 +# asm 2: vpand r=%ymm12 +vpand %ymm11,%ymm1,%ymm12 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm11,%ymm2,%ymm9 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm11,%ymm4,%ymm9 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm11,%ymm6,%ymm9 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm11,%ymm8,%ymm9 + +# qhasm: r3 ^= r +# asm 1: vpxor r2=reg256#10 +# asm 2: vpand r2=%ymm9 +vpand %ymm11,%ymm10,%ymm9 + +# qhasm: b1 = mem128[ input_2 + 16 ] x2 +# asm 1: vbroadcasti128 16(b1=reg256#12 +# asm 2: vbroadcasti128 16(b1=%ymm11 +vbroadcasti128 16(%rdx), %ymm11 + +# qhasm: r = b1 & a5 +# asm 1: vpand r=reg256#13 +# asm 2: vpand r=%ymm12 +vpand %ymm11,%ymm1,%ymm12 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm11,%ymm2,%ymm0 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm11,%ymm4,%ymm0 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm11,%ymm6,%ymm0 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm11,%ymm8,%ymm0 + +# qhasm: r2 ^= r +# asm 1: vpxor r1=reg256#1 +# asm 2: vpand r1=%ymm0 +vpand %ymm11,%ymm10,%ymm0 + +# qhasm: b0 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b0=reg256#12 +# asm 2: vbroadcasti128 0(b0=%ymm11 +vbroadcasti128 0(%rdx), %ymm11 + +# qhasm: r = b0 & a5 +# asm 1: vpand r=reg256#2 +# asm 2: vpand r=%ymm1 +vpand %ymm11,%ymm1,%ymm1 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#2 +# asm 2: vpand r=%ymm1 +vpand %ymm11,%ymm2,%ymm1 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#2 +# asm 2: vpand r=%ymm1 +vpand %ymm11,%ymm4,%ymm1 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#2 +# asm 2: vpand r=%ymm1 +vpand %ymm11,%ymm6,%ymm1 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#2 +# asm 2: vpand r=%ymm1 +vpand %ymm11,%ymm8,%ymm1 + +# qhasm: r1 ^= r +# asm 1: vpxor r0=reg256#2 +# asm 2: vpand r0=%ymm1 +vpand %ymm11,%ymm10,%ymm1 + +# qhasm: mem256[ ptr + 128 ] = r4 +# asm 1: vmovupd h22=reg128#1 +# asm 2: movdqu 528(h22=%xmm0 +movdqu 528(%rcx),%xmm0 + +# qhasm: h13 = h22 +# asm 1: movdqa h13=reg128#2 +# asm 2: movdqa h13=%xmm1 +movdqa %xmm0,%xmm1 + +# qhasm: h10 = h22 +# asm 1: movdqa h10=reg128#1 +# asm 2: movdqa h10=%xmm0 +movdqa %xmm0,%xmm0 + +# qhasm: h21 = mem128[ ptr + 496 ] +# asm 1: movdqu 496(h21=reg128#3 +# asm 2: movdqu 496(h21=%xmm2 +movdqu 496(%rcx),%xmm2 + +# qhasm: h12 = h21 +# asm 1: movdqa h12=reg128#4 +# asm 2: movdqa h12=%xmm3 +movdqa %xmm2,%xmm3 + +# qhasm: h9 = h21 +# asm 1: movdqa h9=reg128#3 +# asm 2: movdqa h9=%xmm2 +movdqa %xmm2,%xmm2 + +# qhasm: h20 = mem128[ ptr + 464 ] +# asm 1: movdqu 464(h20=reg128#5 +# asm 2: movdqu 464(h20=%xmm4 +movdqu 464(%rcx),%xmm4 + +# qhasm: h11 = h20 +# asm 1: movdqa h11=reg128#6 +# asm 2: movdqa h11=%xmm5 +movdqa %xmm4,%xmm5 + +# qhasm: h8 = h20 +# asm 1: movdqa h8=reg128#5 +# asm 2: movdqa h8=%xmm4 +movdqa 
%xmm4,%xmm4 + +# qhasm: h19 = mem128[ ptr + 432 ] +# asm 1: movdqu 432(h19=reg128#7 +# asm 2: movdqu 432(h19=%xmm6 +movdqu 432(%rcx),%xmm6 + +# qhasm: h10 = h10 ^ h19 +# asm 1: vpxor h10=reg128#1 +# asm 2: vpxor h10=%xmm0 +vpxor %xmm6,%xmm0,%xmm0 + +# qhasm: h7 = h19 +# asm 1: movdqa h7=reg128#7 +# asm 2: movdqa h7=%xmm6 +movdqa %xmm6,%xmm6 + +# qhasm: h18 = mem128[ ptr + 400 ] +# asm 1: movdqu 400(h18=reg128#8 +# asm 2: movdqu 400(h18=%xmm7 +movdqu 400(%rcx),%xmm7 + +# qhasm: h9 = h9 ^ h18 +# asm 1: vpxor h9=reg128#3 +# asm 2: vpxor h9=%xmm2 +vpxor %xmm7,%xmm2,%xmm2 + +# qhasm: h6 = h18 +# asm 1: movdqa h6=reg128#8 +# asm 2: movdqa h6=%xmm7 +movdqa %xmm7,%xmm7 + +# qhasm: h17 = mem128[ ptr + 368 ] +# asm 1: movdqu 368(h17=reg128#9 +# asm 2: movdqu 368(h17=%xmm8 +movdqu 368(%rcx),%xmm8 + +# qhasm: h8 = h8 ^ h17 +# asm 1: vpxor h8=reg128#5 +# asm 2: vpxor h8=%xmm4 +vpxor %xmm8,%xmm4,%xmm4 + +# qhasm: h5 = h17 +# asm 1: movdqa h5=reg128#9 +# asm 2: movdqa h5=%xmm8 +movdqa %xmm8,%xmm8 + +# qhasm: h16 = mem128[ ptr + 336 ] +# asm 1: movdqu 336(h16=reg128#10 +# asm 2: movdqu 336(h16=%xmm9 +movdqu 336(%rcx),%xmm9 + +# qhasm: h16 = h16 ^ mem128[ ptr + 512 ] +# asm 1: vpxor 512(h16=reg128#10 +# asm 2: vpxor 512(h16=%xmm9 +vpxor 512(%rcx),%xmm9,%xmm9 + +# qhasm: h7 = h7 ^ h16 +# asm 1: vpxor h7=reg128#7 +# asm 2: vpxor h7=%xmm6 +vpxor %xmm9,%xmm6,%xmm6 + +# qhasm: h4 = h16 +# asm 1: movdqa h4=reg128#10 +# asm 2: movdqa h4=%xmm9 +movdqa %xmm9,%xmm9 + +# qhasm: h15 = mem128[ ptr + 304 ] +# asm 1: movdqu 304(h15=reg128#11 +# asm 2: movdqu 304(h15=%xmm10 +movdqu 304(%rcx),%xmm10 + +# qhasm: h15 = h15 ^ mem128[ ptr + 480 ] +# asm 1: vpxor 480(h15=reg128#11 +# asm 2: vpxor 480(h15=%xmm10 +vpxor 480(%rcx),%xmm10,%xmm10 + +# qhasm: h6 = h6 ^ h15 +# asm 1: vpxor h6=reg128#8 +# asm 2: vpxor h6=%xmm7 +vpxor %xmm10,%xmm7,%xmm7 + +# qhasm: h3 = h15 +# asm 1: movdqa h3=reg128#11 +# asm 2: movdqa h3=%xmm10 +movdqa %xmm10,%xmm10 + +# qhasm: h14 = mem128[ ptr + 272 ] +# asm 1: movdqu 272(h14=reg128#12 +# asm 2: movdqu 272(h14=%xmm11 +movdqu 272(%rcx),%xmm11 + +# qhasm: h14 = h14 ^ mem128[ ptr + 448 ] +# asm 1: vpxor 448(h14=reg128#12 +# asm 2: vpxor 448(h14=%xmm11 +vpxor 448(%rcx),%xmm11,%xmm11 + +# qhasm: h5 = h5 ^ h14 +# asm 1: vpxor h5=reg128#9 +# asm 2: vpxor h5=%xmm8 +vpxor %xmm11,%xmm8,%xmm8 + +# qhasm: h2 = h14 +# asm 1: movdqa h2=reg128#12 +# asm 2: movdqa h2=%xmm11 +movdqa %xmm11,%xmm11 + +# qhasm: h13 = h13 ^ mem128[ ptr + 240 ] +# asm 1: vpxor 240(h13=reg128#2 +# asm 2: vpxor 240(h13=%xmm1 +vpxor 240(%rcx),%xmm1,%xmm1 + +# qhasm: h13 = h13 ^ mem128[ ptr + 416 ] +# asm 1: vpxor 416(h13=reg128#2 +# asm 2: vpxor 416(h13=%xmm1 +vpxor 416(%rcx),%xmm1,%xmm1 + +# qhasm: h4 = h4 ^ h13 +# asm 1: vpxor h4=reg128#10 +# asm 2: vpxor h4=%xmm9 +vpxor %xmm1,%xmm9,%xmm9 + +# qhasm: h1 = h13 +# asm 1: movdqa h1=reg128#2 +# asm 2: movdqa h1=%xmm1 +movdqa %xmm1,%xmm1 + +# qhasm: h12 = h12 ^ mem128[ ptr + 208 ] +# asm 1: vpxor 208(h12=reg128#4 +# asm 2: vpxor 208(h12=%xmm3 +vpxor 208(%rcx),%xmm3,%xmm3 + +# qhasm: h12 = h12 ^ mem128[ ptr + 384 ] +# asm 1: vpxor 384(h12=reg128#4 +# asm 2: vpxor 384(h12=%xmm3 +vpxor 384(%rcx),%xmm3,%xmm3 + +# qhasm: h3 = h3 ^ h12 +# asm 1: vpxor h3=reg128#11 +# asm 2: vpxor h3=%xmm10 +vpxor %xmm3,%xmm10,%xmm10 + +# qhasm: h0 = h12 +# asm 1: movdqa h0=reg128#4 +# asm 2: movdqa h0=%xmm3 +movdqa %xmm3,%xmm3 + +# qhasm: h11 = h11 ^ mem128[ ptr + 352 ] +# asm 1: vpxor 352(h11=reg128#6 +# asm 2: vpxor 352(h11=%xmm5 +vpxor 352(%rcx),%xmm5,%xmm5 + +# qhasm: h11 = h11 ^ mem128[ ptr + 176 ] +# asm 1: 
vpxor 176(h11=reg128#6 +# asm 2: vpxor 176(h11=%xmm5 +vpxor 176(%rcx),%xmm5,%xmm5 + +# qhasm: mem128[ input_0 + 176 ] = h11 +# asm 1: movdqu h10=reg128#1 +# asm 2: vpxor 320(h10=%xmm0 +vpxor 320(%rcx),%xmm0,%xmm0 + +# qhasm: h10 = h10 ^ mem128[ ptr + 144 ] +# asm 1: vpxor 144(h10=reg128#1 +# asm 2: vpxor 144(h10=%xmm0 +vpxor 144(%rcx),%xmm0,%xmm0 + +# qhasm: mem128[ input_0 + 160 ] = h10 +# asm 1: movdqu h9=reg128#1 +# asm 2: vpxor 288(h9=%xmm0 +vpxor 288(%rcx),%xmm2,%xmm0 + +# qhasm: h9 = h9 ^ mem128[ ptr + 112 ] +# asm 1: vpxor 112(h9=reg128#1 +# asm 2: vpxor 112(h9=%xmm0 +vpxor 112(%rcx),%xmm0,%xmm0 + +# qhasm: mem128[ input_0 + 144 ] = h9 +# asm 1: movdqu h8=reg128#1 +# asm 2: vpxor 256(h8=%xmm0 +vpxor 256(%rcx),%xmm4,%xmm0 + +# qhasm: h8 = h8 ^ mem128[ ptr + 80 ] +# asm 1: vpxor 80(h8=reg128#1 +# asm 2: vpxor 80(h8=%xmm0 +vpxor 80(%rcx),%xmm0,%xmm0 + +# qhasm: mem128[ input_0 + 128 ] = h8 +# asm 1: movdqu h7=reg128#1 +# asm 2: vpxor 224(h7=%xmm0 +vpxor 224(%rcx),%xmm6,%xmm0 + +# qhasm: h7 = h7 ^ mem128[ ptr + 48 ] +# asm 1: vpxor 48(h7=reg128#1 +# asm 2: vpxor 48(h7=%xmm0 +vpxor 48(%rcx),%xmm0,%xmm0 + +# qhasm: mem128[ input_0 + 112 ] = h7 +# asm 1: movdqu h6=reg128#1 +# asm 2: vpxor 192(h6=%xmm0 +vpxor 192(%rcx),%xmm7,%xmm0 + +# qhasm: h6 = h6 ^ mem128[ ptr + 16 ] +# asm 1: vpxor 16(h6=reg128#1 +# asm 2: vpxor 16(h6=%xmm0 +vpxor 16(%rcx),%xmm0,%xmm0 + +# qhasm: mem128[ input_0 + 96 ] = h6 +# asm 1: movdqu h5=reg128#1 +# asm 2: vpxor 160(h5=%xmm0 +vpxor 160(%rcx),%xmm8,%xmm0 + +# qhasm: mem128[ input_0 + 80 ] = h5 +# asm 1: movdqu h4=reg128#1 +# asm 2: vpxor 128(h4=%xmm0 +vpxor 128(%rcx),%xmm9,%xmm0 + +# qhasm: mem128[ input_0 + 64 ] = h4 +# asm 1: movdqu h3=reg128#1 +# asm 2: vpxor 96(h3=%xmm0 +vpxor 96(%rcx),%xmm10,%xmm0 + +# qhasm: mem128[ input_0 + 48 ] = h3 +# asm 1: movdqu h2=reg128#1 +# asm 2: vpxor 64(h2=%xmm0 +vpxor 64(%rcx),%xmm11,%xmm0 + +# qhasm: mem128[ input_0 + 32 ] = h2 +# asm 1: movdqu h1=reg128#1 +# asm 2: vpxor 32(h1=%xmm0 +vpxor 32(%rcx),%xmm1,%xmm0 + +# qhasm: mem128[ input_0 + 16 ] = h1 +# asm 1: movdqu h0=reg128#1 +# asm 2: vpxor 0(h0=%xmm0 +vpxor 0(%rcx),%xmm3,%xmm0 + +# qhasm: mem128[ input_0 + 0 ] = h0 +# asm 1: movdqu + +typedef __m256i vec256; + +vec256 PQCLEAN_MCELIECE348864_AVX_vec256_set1_16b(uint16_t a); +vec256 PQCLEAN_MCELIECE348864_AVX_vec256_setzero(void); +vec256 PQCLEAN_MCELIECE348864_AVX_vec256_set4x(uint64_t a0, uint64_t a1, uint64_t a2, uint64_t a3); + +// Extract requires a literal argument so need to be macros +#define PQCLEAN_MCELIECE348864_AVX_vec256_extract2x(a,i) ((vec128) _mm256_extractf128_si256((vec256) (a),(i))) +#define PQCLEAN_MCELIECE348864_AVX_vec256_extract(a,i) ((uint64_t) _mm256_extract_epi64((vec256) (a),(i))) + +int PQCLEAN_MCELIECE348864_AVX_vec256_testz(vec256 a); +vec256 PQCLEAN_MCELIECE348864_AVX_vec256_and(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE348864_AVX_vec256_xor(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE348864_AVX_vec256_or(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE348864_AVX_vec256_sll_4x(vec256 a, int s); +vec256 PQCLEAN_MCELIECE348864_AVX_vec256_srl_4x(vec256 a, int s); +vec256 PQCLEAN_MCELIECE348864_AVX_vec256_unpack_low(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE348864_AVX_vec256_unpack_high(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE348864_AVX_vec256_unpack_low_2x(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE348864_AVX_vec256_unpack_high_2x(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE348864_AVX_vec256_or_reduce(const vec256 *a); +void PQCLEAN_MCELIECE348864_AVX_vec256_copy(vec256 *dest, const 
vec256 *src); + +/* bitsliced field multiplications */ +void PQCLEAN_MCELIECE348864_AVX_vec256_mul(vec256 *h, vec256 *f, const vec256 *g); +void PQCLEAN_MCELIECE348864_AVX_vec256_sq(vec256 * /*out*/, const vec256 * /*in*/); +void PQCLEAN_MCELIECE348864_AVX_vec256_inv(vec256 * /*out*/, const vec256 * /*in*/); + +extern void PQCLEAN_MCELIECE348864_AVX_vec256_maa_asm(vec256 *, vec256 *, const vec256 *); +extern void PQCLEAN_MCELIECE348864_AVX_vec256_ama_asm(vec256 *, vec256 *, const vec256 *); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_avx/vec256_mul_asm.S b/src/kem/classic_mceliece/pqclean_mceliece348864_avx/vec256_mul_asm.S new file mode 100644 index 0000000000..5df2bcd737 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_avx/vec256_mul_asm.S @@ -0,0 +1,1736 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: reg256 a0 + +# qhasm: reg256 a1 + +# qhasm: reg256 a2 + +# qhasm: reg256 a3 + +# qhasm: reg256 a4 + +# qhasm: reg256 a5 + +# qhasm: reg256 a6 + +# qhasm: reg256 a7 + +# qhasm: reg256 a8 + +# qhasm: reg256 a9 + +# qhasm: reg256 a10 + +# qhasm: reg256 a11 + +# qhasm: reg256 b0 + +# qhasm: reg256 b1 + +# qhasm: reg256 r0 + +# qhasm: reg256 r1 + +# qhasm: reg256 r2 + +# qhasm: reg256 r3 + +# qhasm: reg256 r4 + +# qhasm: reg256 r5 + +# qhasm: reg256 r6 + +# qhasm: reg256 r7 + +# qhasm: reg256 r8 + +# qhasm: reg256 r9 + +# qhasm: reg256 r10 + +# qhasm: reg256 r11 + +# qhasm: reg256 r12 + +# qhasm: reg256 r13 + +# qhasm: reg256 r14 + +# qhasm: reg256 r15 + +# qhasm: reg256 r16 + +# qhasm: reg256 r17 + +# qhasm: reg256 r18 + +# qhasm: reg256 r19 + +# qhasm: reg256 r20 + +# qhasm: reg256 r21 + +# qhasm: reg256 r22 + +# qhasm: reg256 r + +# qhasm: enter vec256_mul_asm +.p2align 5 +.global _PQCLEAN_MCELIECE348864_AVX_vec256_mul_asm +.global PQCLEAN_MCELIECE348864_AVX_vec256_mul_asm +_PQCLEAN_MCELIECE348864_AVX_vec256_mul_asm: +PQCLEAN_MCELIECE348864_AVX_vec256_mul_asm: +mov %rsp,%r11 +and $31,%r11 +add $0,%r11 +sub %r11,%rsp + +# qhasm: b0 = mem256[ input_2 + 0 ] +# asm 1: vmovupd 0(b0=reg256#1 +# asm 2: vmovupd 0(b0=%ymm0 +vmovupd 0(%rdx),%ymm0 + +# qhasm: a11 = mem256[ input_1 + 352 ] +# asm 1: vmovupd 352(a11=reg256#2 +# asm 2: vmovupd 352(a11=%ymm1 +vmovupd 352(%rsi),%ymm1 + +# qhasm: r11 = a11 & b0 +# asm 1: vpand r11=reg256#3 +# asm 2: vpand r11=%ymm2 +vpand %ymm1,%ymm0,%ymm2 + +# qhasm: r12 = a11 & mem256[input_2 + 32] +# asm 1: vpand 32(r12=reg256#4 +# asm 2: vpand 32(r12=%ymm3 +vpand 32(%rdx),%ymm1,%ymm3 + +# qhasm: r13 = a11 & mem256[input_2 + 64] +# asm 1: vpand 64(r13=reg256#5 +# asm 2: vpand 64(r13=%ymm4 +vpand 64(%rdx),%ymm1,%ymm4 + +# qhasm: r14 = a11 & mem256[input_2 + 96] +# asm 1: vpand 96(r14=reg256#6 +# asm 2: vpand 96(r14=%ymm5 +vpand 96(%rdx),%ymm1,%ymm5 + +# qhasm: r15 = a11 & mem256[input_2 + 128] +# asm 1: vpand 128(r15=reg256#7 +# asm 2: vpand 128(r15=%ymm6 +vpand 128(%rdx),%ymm1,%ymm6 + +# qhasm: r16 = a11 & mem256[input_2 + 160] +# asm 1: vpand 160(r16=reg256#8 +# asm 2: vpand 160(r16=%ymm7 +vpand 160(%rdx),%ymm1,%ymm7 + +# qhasm: r17 = a11 & mem256[input_2 + 192] +# asm 1: vpand 192(r17=reg256#9 +# asm 2: vpand 192(r17=%ymm8 +vpand 
192(%rdx),%ymm1,%ymm8 + +# qhasm: r18 = a11 & mem256[input_2 + 224] +# asm 1: vpand 224(r18=reg256#10 +# asm 2: vpand 224(r18=%ymm9 +vpand 224(%rdx),%ymm1,%ymm9 + +# qhasm: r19 = a11 & mem256[input_2 + 256] +# asm 1: vpand 256(r19=reg256#11 +# asm 2: vpand 256(r19=%ymm10 +vpand 256(%rdx),%ymm1,%ymm10 + +# qhasm: r20 = a11 & mem256[input_2 + 288] +# asm 1: vpand 288(r20=reg256#12 +# asm 2: vpand 288(r20=%ymm11 +vpand 288(%rdx),%ymm1,%ymm11 + +# qhasm: r21 = a11 & mem256[input_2 + 320] +# asm 1: vpand 320(r21=reg256#13 +# asm 2: vpand 320(r21=%ymm12 +vpand 320(%rdx),%ymm1,%ymm12 + +# qhasm: r22 = a11 & mem256[input_2 + 352] +# asm 1: vpand 352(r22=reg256#2 +# asm 2: vpand 352(r22=%ymm1 +vpand 352(%rdx),%ymm1,%ymm1 + +# qhasm: r13 ^= r22 +# asm 1: vpxor r10=reg256#2 +# asm 2: vmovapd r10=%ymm1 +vmovapd %ymm1,%ymm1 + +# qhasm: a10 = mem256[ input_1 + 320 ] +# asm 1: vmovupd 320(a10=reg256#14 +# asm 2: vmovupd 320(a10=%ymm13 +vmovupd 320(%rsi),%ymm13 + +# qhasm: r = a10 & b0 +# asm 1: vpand r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm0,%ymm14 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 32(r=%ymm14 +vpand 32(%rdx),%ymm13,%ymm14 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 64(r=%ymm14 +vpand 64(%rdx),%ymm13,%ymm14 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 96(r=%ymm14 +vpand 96(%rdx),%ymm13,%ymm14 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 128(r=%ymm14 +vpand 128(%rdx),%ymm13,%ymm14 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 160(r=%ymm14 +vpand 160(%rdx),%ymm13,%ymm14 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 192(r=%ymm14 +vpand 192(%rdx),%ymm13,%ymm14 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 224(r=%ymm14 +vpand 224(%rdx),%ymm13,%ymm14 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 256(r=%ymm14 +vpand 256(%rdx),%ymm13,%ymm14 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 288(r=%ymm14 +vpand 288(%rdx),%ymm13,%ymm14 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 320(r=%ymm14 +vpand 320(%rdx),%ymm13,%ymm14 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#14 +# asm 2: vpand 352(r=%ymm13 +vpand 352(%rdx),%ymm13,%ymm13 + +# qhasm: r21 ^= r +# asm 1: vpxor r9=reg256#13 +# asm 2: vmovapd r9=%ymm12 +vmovapd %ymm12,%ymm12 + +# qhasm: a9 = mem256[ input_1 + 288 ] +# asm 1: vmovupd 288(a9=reg256#14 +# asm 2: vmovupd 288(a9=%ymm13 +vmovupd 288(%rsi),%ymm13 + +# qhasm: r = a9 & b0 +# asm 1: vpand r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm0,%ymm14 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 32(r=%ymm14 +vpand 32(%rdx),%ymm13,%ymm14 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 64(r=%ymm14 +vpand 64(%rdx),%ymm13,%ymm14 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 96(r=%ymm14 +vpand 96(%rdx),%ymm13,%ymm14 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 128(r=%ymm14 +vpand 128(%rdx),%ymm13,%ymm14 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 160(r=%ymm14 +vpand 160(%rdx),%ymm13,%ymm14 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 192(r=%ymm14 +vpand 192(%rdx),%ymm13,%ymm14 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 224(r=%ymm14 +vpand 224(%rdx),%ymm13,%ymm14 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 256(r=%ymm14 +vpand 256(%rdx),%ymm13,%ymm14 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 288(r=%ymm14 +vpand 
288(%rdx),%ymm13,%ymm14 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 320(r=%ymm14 +vpand 320(%rdx),%ymm13,%ymm14 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#14 +# asm 2: vpand 352(r=%ymm13 +vpand 352(%rdx),%ymm13,%ymm13 + +# qhasm: r20 ^= r +# asm 1: vpxor r8=reg256#12 +# asm 2: vmovapd r8=%ymm11 +vmovapd %ymm11,%ymm11 + +# qhasm: a8 = mem256[ input_1 + 256 ] +# asm 1: vmovupd 256(a8=reg256#14 +# asm 2: vmovupd 256(a8=%ymm13 +vmovupd 256(%rsi),%ymm13 + +# qhasm: r = a8 & b0 +# asm 1: vpand r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm0,%ymm14 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 32(r=%ymm14 +vpand 32(%rdx),%ymm13,%ymm14 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 64(r=%ymm14 +vpand 64(%rdx),%ymm13,%ymm14 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 96(r=%ymm14 +vpand 96(%rdx),%ymm13,%ymm14 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 128(r=%ymm14 +vpand 128(%rdx),%ymm13,%ymm14 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 160(r=%ymm14 +vpand 160(%rdx),%ymm13,%ymm14 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 192(r=%ymm14 +vpand 192(%rdx),%ymm13,%ymm14 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 224(r=%ymm14 +vpand 224(%rdx),%ymm13,%ymm14 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 256(r=%ymm14 +vpand 256(%rdx),%ymm13,%ymm14 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 288(r=%ymm14 +vpand 288(%rdx),%ymm13,%ymm14 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 320(r=%ymm14 +vpand 320(%rdx),%ymm13,%ymm14 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#14 +# asm 2: vpand 352(r=%ymm13 +vpand 352(%rdx),%ymm13,%ymm13 + +# qhasm: r19 ^= r +# asm 1: vpxor r7=reg256#11 +# asm 2: vmovapd r7=%ymm10 +vmovapd %ymm10,%ymm10 + +# qhasm: a7 = mem256[ input_1 + 224 ] +# asm 1: vmovupd 224(a7=reg256#14 +# asm 2: vmovupd 224(a7=%ymm13 +vmovupd 224(%rsi),%ymm13 + +# qhasm: r = a7 & b0 +# asm 1: vpand r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm0,%ymm14 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 32(r=%ymm14 +vpand 32(%rdx),%ymm13,%ymm14 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 64(r=%ymm14 +vpand 64(%rdx),%ymm13,%ymm14 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 96(r=%ymm14 +vpand 96(%rdx),%ymm13,%ymm14 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 128(r=%ymm14 +vpand 128(%rdx),%ymm13,%ymm14 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 160(r=%ymm14 +vpand 160(%rdx),%ymm13,%ymm14 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 192(r=%ymm14 +vpand 192(%rdx),%ymm13,%ymm14 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 224(r=%ymm14 +vpand 224(%rdx),%ymm13,%ymm14 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 256(r=%ymm14 +vpand 256(%rdx),%ymm13,%ymm14 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 288(r=%ymm14 +vpand 288(%rdx),%ymm13,%ymm14 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 320(r=%ymm14 +vpand 320(%rdx),%ymm13,%ymm14 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#14 +# asm 2: vpand 352(r=%ymm13 +vpand 352(%rdx),%ymm13,%ymm13 + +# qhasm: r18 ^= r +# asm 1: vpxor r6=reg256#10 +# asm 2: vmovapd r6=%ymm9 +vmovapd %ymm9,%ymm9 + +# qhasm: a6 = mem256[ input_1 + 192 ] +# asm 1: vmovupd 192(a6=reg256#14 +# asm 2: vmovupd 192(a6=%ymm13 +vmovupd 192(%rsi),%ymm13 + +# qhasm: r = a6 & 
b0 +# asm 1: vpand r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm0,%ymm14 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 32(r=%ymm14 +vpand 32(%rdx),%ymm13,%ymm14 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 64(r=%ymm14 +vpand 64(%rdx),%ymm13,%ymm14 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 96(r=%ymm14 +vpand 96(%rdx),%ymm13,%ymm14 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 128(r=%ymm14 +vpand 128(%rdx),%ymm13,%ymm14 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 160(r=%ymm14 +vpand 160(%rdx),%ymm13,%ymm14 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 192(r=%ymm14 +vpand 192(%rdx),%ymm13,%ymm14 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 224(r=%ymm14 +vpand 224(%rdx),%ymm13,%ymm14 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 256(r=%ymm14 +vpand 256(%rdx),%ymm13,%ymm14 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 288(r=%ymm14 +vpand 288(%rdx),%ymm13,%ymm14 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 320(r=%ymm14 +vpand 320(%rdx),%ymm13,%ymm14 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#14 +# asm 2: vpand 352(r=%ymm13 +vpand 352(%rdx),%ymm13,%ymm13 + +# qhasm: r17 ^= r +# asm 1: vpxor r5=reg256#9 +# asm 2: vmovapd r5=%ymm8 +vmovapd %ymm8,%ymm8 + +# qhasm: a5 = mem256[ input_1 + 160 ] +# asm 1: vmovupd 160(a5=reg256#14 +# asm 2: vmovupd 160(a5=%ymm13 +vmovupd 160(%rsi),%ymm13 + +# qhasm: r = a5 & b0 +# asm 1: vpand r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm0,%ymm14 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 32(r=%ymm14 +vpand 32(%rdx),%ymm13,%ymm14 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 64(r=%ymm14 +vpand 64(%rdx),%ymm13,%ymm14 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 96(r=%ymm14 +vpand 96(%rdx),%ymm13,%ymm14 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 128(r=%ymm14 +vpand 128(%rdx),%ymm13,%ymm14 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 160(r=%ymm14 +vpand 160(%rdx),%ymm13,%ymm14 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 192(r=%ymm14 +vpand 192(%rdx),%ymm13,%ymm14 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 224(r=%ymm14 +vpand 224(%rdx),%ymm13,%ymm14 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 256(r=%ymm14 +vpand 256(%rdx),%ymm13,%ymm14 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 288(r=%ymm14 +vpand 288(%rdx),%ymm13,%ymm14 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 320(r=%ymm14 +vpand 320(%rdx),%ymm13,%ymm14 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#14 +# asm 2: vpand 352(r=%ymm13 +vpand 352(%rdx),%ymm13,%ymm13 + +# qhasm: r16 ^= r +# asm 1: vpxor r4=reg256#8 +# asm 2: vmovapd r4=%ymm7 +vmovapd %ymm7,%ymm7 + +# qhasm: a4 = mem256[ input_1 + 128 ] +# asm 1: vmovupd 128(a4=reg256#14 +# asm 2: vmovupd 128(a4=%ymm13 +vmovupd 128(%rsi),%ymm13 + +# qhasm: r = a4 & b0 +# asm 1: vpand r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm0,%ymm14 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 32(r=%ymm14 +vpand 32(%rdx),%ymm13,%ymm14 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 64(r=%ymm14 +vpand 64(%rdx),%ymm13,%ymm14 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 96(r=%ymm14 +vpand 96(%rdx),%ymm13,%ymm14 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 128(r=%ymm14 +vpand 
128(%rdx),%ymm13,%ymm14 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 160(r=%ymm14 +vpand 160(%rdx),%ymm13,%ymm14 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 192(r=%ymm14 +vpand 192(%rdx),%ymm13,%ymm14 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 224(r=%ymm14 +vpand 224(%rdx),%ymm13,%ymm14 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 256(r=%ymm14 +vpand 256(%rdx),%ymm13,%ymm14 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 288(r=%ymm14 +vpand 288(%rdx),%ymm13,%ymm14 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 320(r=%ymm14 +vpand 320(%rdx),%ymm13,%ymm14 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#14 +# asm 2: vpand 352(r=%ymm13 +vpand 352(%rdx),%ymm13,%ymm13 + +# qhasm: r15 ^= r +# asm 1: vpxor r3=reg256#7 +# asm 2: vmovapd r3=%ymm6 +vmovapd %ymm6,%ymm6 + +# qhasm: a3 = mem256[ input_1 + 96 ] +# asm 1: vmovupd 96(a3=reg256#14 +# asm 2: vmovupd 96(a3=%ymm13 +vmovupd 96(%rsi),%ymm13 + +# qhasm: r = a3 & b0 +# asm 1: vpand r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm0,%ymm14 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 32(r=%ymm14 +vpand 32(%rdx),%ymm13,%ymm14 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 64(r=%ymm14 +vpand 64(%rdx),%ymm13,%ymm14 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 96(r=%ymm14 +vpand 96(%rdx),%ymm13,%ymm14 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 128(r=%ymm14 +vpand 128(%rdx),%ymm13,%ymm14 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 160(r=%ymm14 +vpand 160(%rdx),%ymm13,%ymm14 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 192(r=%ymm14 +vpand 192(%rdx),%ymm13,%ymm14 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 224(r=%ymm14 +vpand 224(%rdx),%ymm13,%ymm14 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 256(r=%ymm14 +vpand 256(%rdx),%ymm13,%ymm14 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 288(r=%ymm14 +vpand 288(%rdx),%ymm13,%ymm14 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 320(r=%ymm14 +vpand 320(%rdx),%ymm13,%ymm14 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#14 +# asm 2: vpand 352(r=%ymm13 +vpand 352(%rdx),%ymm13,%ymm13 + +# qhasm: r14 ^= r +# asm 1: vpxor r2=reg256#6 +# asm 2: vmovapd r2=%ymm5 +vmovapd %ymm5,%ymm5 + +# qhasm: a2 = mem256[ input_1 + 64 ] +# asm 1: vmovupd 64(a2=reg256#14 +# asm 2: vmovupd 64(a2=%ymm13 +vmovupd 64(%rsi),%ymm13 + +# qhasm: r = a2 & b0 +# asm 1: vpand r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm0,%ymm14 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 32(r=%ymm14 +vpand 32(%rdx),%ymm13,%ymm14 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 64(r=%ymm14 +vpand 64(%rdx),%ymm13,%ymm14 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 96(r=%ymm14 +vpand 96(%rdx),%ymm13,%ymm14 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 128(r=%ymm14 +vpand 128(%rdx),%ymm13,%ymm14 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 160(r=%ymm14 +vpand 160(%rdx),%ymm13,%ymm14 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 192(r=%ymm14 +vpand 192(%rdx),%ymm13,%ymm14 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 224(r=%ymm14 +vpand 224(%rdx),%ymm13,%ymm14 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 256(r=%ymm14 +vpand 256(%rdx),%ymm13,%ymm14 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: 
vpand 288(r=%ymm14 +vpand 288(%rdx),%ymm13,%ymm14 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 320(r=%ymm14 +vpand 320(%rdx),%ymm13,%ymm14 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#14 +# asm 2: vpand 352(r=%ymm13 +vpand 352(%rdx),%ymm13,%ymm13 + +# qhasm: r13 ^= r +# asm 1: vpxor r1=reg256#5 +# asm 2: vmovapd r1=%ymm4 +vmovapd %ymm4,%ymm4 + +# qhasm: a1 = mem256[ input_1 + 32 ] +# asm 1: vmovupd 32(a1=reg256#14 +# asm 2: vmovupd 32(a1=%ymm13 +vmovupd 32(%rsi),%ymm13 + +# qhasm: r = a1 & b0 +# asm 1: vpand r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm0,%ymm14 + +# qhasm: r1 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 32(r=%ymm14 +vpand 32(%rdx),%ymm13,%ymm14 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 64(r=%ymm14 +vpand 64(%rdx),%ymm13,%ymm14 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 96(r=%ymm14 +vpand 96(%rdx),%ymm13,%ymm14 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 128(r=%ymm14 +vpand 128(%rdx),%ymm13,%ymm14 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 160(r=%ymm14 +vpand 160(%rdx),%ymm13,%ymm14 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 192(r=%ymm14 +vpand 192(%rdx),%ymm13,%ymm14 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 224(r=%ymm14 +vpand 224(%rdx),%ymm13,%ymm14 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 256(r=%ymm14 +vpand 256(%rdx),%ymm13,%ymm14 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 288(r=%ymm14 +vpand 288(%rdx),%ymm13,%ymm14 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 320(r=%ymm14 +vpand 320(%rdx),%ymm13,%ymm14 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#14 +# asm 2: vpand 352(r=%ymm13 +vpand 352(%rdx),%ymm13,%ymm13 + +# qhasm: r12 ^= r +# asm 1: vpxor r0=reg256#4 +# asm 2: vmovapd r0=%ymm3 +vmovapd %ymm3,%ymm3 + +# qhasm: a0 = mem256[ input_1 + 0 ] +# asm 1: vmovupd 0(a0=reg256#14 +# asm 2: vmovupd 0(a0=%ymm13 +vmovupd 0(%rsi),%ymm13 + +# qhasm: r = a0 & b0 +# asm 1: vpand r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm0,%ymm0 + +# qhasm: r0 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 32(r=%ymm0 +vpand 32(%rdx),%ymm13,%ymm0 + +# qhasm: r1 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 64(r=%ymm0 +vpand 64(%rdx),%ymm13,%ymm0 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 96(r=%ymm0 +vpand 96(%rdx),%ymm13,%ymm0 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 128(r=%ymm0 +vpand 128(%rdx),%ymm13,%ymm0 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 160(r=%ymm0 +vpand 160(%rdx),%ymm13,%ymm0 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 192(r=%ymm0 +vpand 192(%rdx),%ymm13,%ymm0 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 224(r=%ymm0 +vpand 224(%rdx),%ymm13,%ymm0 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 256(r=%ymm0 +vpand 256(%rdx),%ymm13,%ymm0 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 288(r=%ymm0 +vpand 288(%rdx),%ymm13,%ymm0 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 320(r=%ymm0 +vpand 320(%rdx),%ymm13,%ymm0 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 352(r=%ymm0 +vpand 352(%rdx),%ymm13,%ymm0 + +# qhasm: r11 ^= r +# asm 1: vpxor r11_stack=stack64#1 +# asm 2: movq r11_stack=608(%rsp) +movq %r11,608(%rsp) + +# qhasm: r12_stack = caller_r12 +# asm 1: movq r12_stack=stack64#2 +# asm 2: movq r12_stack=616(%rsp) +movq %r12,616(%rsp) + +# qhasm: r13_stack = caller_r13 +# asm 1: movq 
r13_stack=stack64#3 +# asm 2: movq r13_stack=624(%rsp) +movq %r13,624(%rsp) + +# qhasm: r14_stack = caller_r14 +# asm 1: movq r14_stack=stack64#4 +# asm 2: movq r14_stack=632(%rsp) +movq %r14,632(%rsp) + +# qhasm: r15_stack = caller_r15 +# asm 1: movq r15_stack=stack64#5 +# asm 2: movq r15_stack=640(%rsp) +movq %r15,640(%rsp) + +# qhasm: rbx_stack = caller_rbx +# asm 1: movq rbx_stack=stack64#6 +# asm 2: movq rbx_stack=648(%rsp) +movq %rbx,648(%rsp) + +# qhasm: ptr = &buf +# asm 1: leaq ptr=int64#4 +# asm 2: leaq ptr=%rcx +leaq 0(%rsp),%rcx + +# qhasm: s0 = mem256[ input_1 + 0 ] +# asm 1: vmovupd 0(s0=reg256#1 +# asm 2: vmovupd 0(s0=%ymm0 +vmovupd 0(%rsi),%ymm0 + +# qhasm: s1 = mem256[ input_1 + 32 ] +# asm 1: vmovupd 32(s1=reg256#2 +# asm 2: vmovupd 32(s1=%ymm1 +vmovupd 32(%rsi),%ymm1 + +# qhasm: s2 = mem256[ input_1 + 64 ] +# asm 1: vmovupd 64(s2=reg256#3 +# asm 2: vmovupd 64(s2=%ymm2 +vmovupd 64(%rsi),%ymm2 + +# qhasm: t0 = mem256[ input_2 + 0 ] +# asm 1: vmovupd 0(t0=reg256#4 +# asm 2: vmovupd 0(t0=%ymm3 +vmovupd 0(%rdx),%ymm3 + +# qhasm: t1 = mem256[ input_2 + 32 ] +# asm 1: vmovupd 32(t1=reg256#5 +# asm 2: vmovupd 32(t1=%ymm4 +vmovupd 32(%rdx),%ymm4 + +# qhasm: t2 = mem256[ input_2 + 64 ] +# asm 1: vmovupd 64(t2=reg256#6 +# asm 2: vmovupd 64(t2=%ymm5 +vmovupd 64(%rdx),%ymm5 + +# qhasm: a5[0,1,2,3] = s2[2,2,3,3] +# asm 1: vpermq $0xfa,a5=reg256#7 +# asm 2: vpermq $0xfa,a5=%ymm6 +vpermq $0xfa,%ymm2,%ymm6 + +# qhasm: b5[0,1,2,3] = t2[2,3,2,3] +# asm 1: vpermq $0xee,b5=reg256#8 +# asm 2: vpermq $0xee,b5=%ymm7 +vpermq $0xee,%ymm5,%ymm7 + +# qhasm: r10 = a5 & b5 +# asm 1: vpand r10=reg256#9 +# asm 2: vpand r10=%ymm8 +vpand %ymm6,%ymm7,%ymm8 + +# qhasm: mem256[ ptr + 320 ] = r10 +# asm 1: vmovupd b4=reg256#6 +# asm 2: vpermq $0x44,b4=%ymm5 +vpermq $0x44,%ymm5,%ymm5 + +# qhasm: r9 = a5 & b4 +# asm 1: vpand r9=reg256#9 +# asm 2: vpand r9=%ymm8 +vpand %ymm6,%ymm5,%ymm8 + +# qhasm: b3[0,1,2,3] = t1[2,3,2,3] +# asm 1: vpermq $0xee,b3=reg256#10 +# asm 2: vpermq $0xee,b3=%ymm9 +vpermq $0xee,%ymm4,%ymm9 + +# qhasm: r8 = a5 & b3 +# asm 1: vpand r8=reg256#11 +# asm 2: vpand r8=%ymm10 +vpand %ymm6,%ymm9,%ymm10 + +# qhasm: b2[0,1,2,3] = t1[0,1,0,1] +# asm 1: vpermq $0x44,b2=reg256#5 +# asm 2: vpermq $0x44,b2=%ymm4 +vpermq $0x44,%ymm4,%ymm4 + +# qhasm: r7 = a5 & b2 +# asm 1: vpand r7=reg256#12 +# asm 2: vpand r7=%ymm11 +vpand %ymm6,%ymm4,%ymm11 + +# qhasm: b1[0,1,2,3] = t0[2,3,2,3] +# asm 1: vpermq $0xee,b1=reg256#13 +# asm 2: vpermq $0xee,b1=%ymm12 +vpermq $0xee,%ymm3,%ymm12 + +# qhasm: r6 = a5 & b1 +# asm 1: vpand r6=reg256#14 +# asm 2: vpand r6=%ymm13 +vpand %ymm6,%ymm12,%ymm13 + +# qhasm: b0[0,1,2,3] = t0[0,1,0,1] +# asm 1: vpermq $0x44,b0=reg256#4 +# asm 2: vpermq $0x44,b0=%ymm3 +vpermq $0x44,%ymm3,%ymm3 + +# qhasm: r5 = a5 & b0 +# asm 1: vpand r5=reg256#7 +# asm 2: vpand r5=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: a4[0,1,2,3] = s2[0,0,1,1] +# asm 1: vpermq $0x50,a4=reg256#3 +# asm 2: vpermq $0x50,a4=%ymm2 +vpermq $0x50,%ymm2,%ymm2 + +# qhasm: r = a4 & b5 +# asm 1: vpand r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm2,%ymm7,%ymm14 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#9 +# asm 2: vpand r=%ymm8 +vpand %ymm2,%ymm5,%ymm8 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#9 +# asm 2: vpand r=%ymm8 +vpand %ymm2,%ymm9,%ymm8 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#9 +# asm 2: vpand r=%ymm8 +vpand %ymm2,%ymm4,%ymm8 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#9 +# asm 2: vpand r=%ymm8 +vpand %ymm2,%ymm12,%ymm8 + +# qhasm: r5 ^= r +# asm 1: vpxor r4=reg256#3 +# asm 2: vpand r4=%ymm2 +vpand 
%ymm2,%ymm3,%ymm2 + +# qhasm: a3[0,1,2,3] = s1[2,2,3,3] +# asm 1: vpermq $0xfa,a3=reg256#9 +# asm 2: vpermq $0xfa,a3=%ymm8 +vpermq $0xfa,%ymm1,%ymm8 + +# qhasm: r = a3 & b5 +# asm 1: vpand r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm8,%ymm7,%ymm14 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#11 +# asm 2: vpand r=%ymm10 +vpand %ymm8,%ymm5,%ymm10 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#11 +# asm 2: vpand r=%ymm10 +vpand %ymm8,%ymm9,%ymm10 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#11 +# asm 2: vpand r=%ymm10 +vpand %ymm8,%ymm4,%ymm10 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#11 +# asm 2: vpand r=%ymm10 +vpand %ymm8,%ymm12,%ymm10 + +# qhasm: r4 ^= r +# asm 1: vpxor r3=reg256#9 +# asm 2: vpand r3=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: a2[0,1,2,3] = s1[0,0,1,1] +# asm 1: vpermq $0x50,a2=reg256#2 +# asm 2: vpermq $0x50,a2=%ymm1 +vpermq $0x50,%ymm1,%ymm1 + +# qhasm: r = a2 & b5 +# asm 1: vpand r=reg256#11 +# asm 2: vpand r=%ymm10 +vpand %ymm1,%ymm7,%ymm10 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#11 +# asm 2: vpand r=%ymm10 +vpand %ymm1,%ymm5,%ymm10 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#11 +# asm 2: vpand r=%ymm10 +vpand %ymm1,%ymm9,%ymm10 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#11 +# asm 2: vpand r=%ymm10 +vpand %ymm1,%ymm4,%ymm10 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#11 +# asm 2: vpand r=%ymm10 +vpand %ymm1,%ymm12,%ymm10 + +# qhasm: r3 ^= r +# asm 1: vpxor r2=reg256#2 +# asm 2: vpand r2=%ymm1 +vpand %ymm1,%ymm3,%ymm1 + +# qhasm: a1[0,1,2,3] = s0[2,2,3,3] +# asm 1: vpermq $0xfa,a1=reg256#11 +# asm 2: vpermq $0xfa,a1=%ymm10 +vpermq $0xfa,%ymm0,%ymm10 + +# qhasm: r = a1 & b5 +# asm 1: vpand r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm10,%ymm7,%ymm11 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm10,%ymm5,%ymm11 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm10,%ymm9,%ymm11 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm10,%ymm4,%ymm11 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm10,%ymm12,%ymm11 + +# qhasm: r2 ^= r +# asm 1: vpxor r1=reg256#11 +# asm 2: vpand r1=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: a0[0,1,2,3] = s0[0,0,1,1] +# asm 1: vpermq $0x50,a0=reg256#1 +# asm 2: vpermq $0x50,a0=%ymm0 +vpermq $0x50,%ymm0,%ymm0 + +# qhasm: r = a0 & b5 +# asm 1: vpand r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm0,%ymm7,%ymm7 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm0,%ymm5,%ymm5 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm0,%ymm9,%ymm5 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#5 +# asm 2: vpand r=%ymm4 +vpand %ymm0,%ymm4,%ymm4 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#5 +# asm 2: vpand r=%ymm4 +vpand %ymm0,%ymm12,%ymm4 + +# qhasm: r1 ^= r +# asm 1: vpxor r0=reg256#1 +# asm 2: vpand r0=%ymm0 +vpand %ymm0,%ymm3,%ymm0 + +# qhasm: mem256[ ptr + 128 ] = r4 +# asm 1: vmovupd h22=int64#2 +# asm 2: movq 344(h22=%rsi +movq 344(%rcx),%rsi + +# qhasm: h13 = h22 +# asm 1: mov h13=int64#3 +# asm 2: mov h13=%rdx +mov %rsi,%rdx + +# qhasm: h10 = h22 +# asm 1: mov h10=int64#2 +# asm 2: mov h10=%rsi +mov %rsi,%rsi + +# qhasm: h21 = mem64[ ptr + 336 ] +# asm 1: movq 336(h21=int64#5 +# asm 2: movq 336(h21=%r8 +movq 336(%rcx),%r8 + +# qhasm: h21 ^= *(uint64 *) ( ptr + 328 ) +# asm 1: xorq 328(h12=int64#6 +# asm 2: mov h12=%r9 +mov %r8,%r9 + +# qhasm: h9 = h21 +# asm 1: mov h9=int64#5 +# asm 2: mov h9=%r8 +mov %r8,%r8 + +# qhasm: h20 = mem64[ ptr 
+ 312 ] +# asm 1: movq 312(h20=int64#7 +# asm 2: movq 312(h20=%rax +movq 312(%rcx),%rax + +# qhasm: h20 ^= *(uint64 *) ( ptr + 320 ) +# asm 1: xorq 320(h11=int64#8 +# asm 2: mov h11=%r10 +mov %rax,%r10 + +# qhasm: h8 = h20 +# asm 1: mov h8=int64#7 +# asm 2: mov h8=%rax +mov %rax,%rax + +# qhasm: h19 = mem64[ ptr + 304 ] +# asm 1: movq 304(h19=int64#9 +# asm 2: movq 304(h19=%r11 +movq 304(%rcx),%r11 + +# qhasm: h19 ^= *(uint64 *) ( ptr + 296 ) +# asm 1: xorq 296(h7=int64#9 +# asm 2: mov h7=%r11 +mov %r11,%r11 + +# qhasm: h18 = mem64[ ptr + 280 ] +# asm 1: movq 280(h18=int64#10 +# asm 2: movq 280(h18=%r12 +movq 280(%rcx),%r12 + +# qhasm: h18 ^= *(uint64 *) ( ptr + 288 ) +# asm 1: xorq 288(h6=int64#10 +# asm 2: mov h6=%r12 +mov %r12,%r12 + +# qhasm: h17 = mem64[ ptr + 272 ] +# asm 1: movq 272(h17=int64#11 +# asm 2: movq 272(h17=%r13 +movq 272(%rcx),%r13 + +# qhasm: h17 ^= *(uint64 *) ( ptr + 264 ) +# asm 1: xorq 264(h5=int64#11 +# asm 2: mov h5=%r13 +mov %r13,%r13 + +# qhasm: h16 = mem64[ ptr + 248 ] +# asm 1: movq 248(h16=int64#12 +# asm 2: movq 248(h16=%r14 +movq 248(%rcx),%r14 + +# qhasm: h16 ^= *(uint64 *) ( ptr + 256 ) +# asm 1: xorq 256(h4=int64#12 +# asm 2: mov h4=%r14 +mov %r14,%r14 + +# qhasm: h15 = mem64[ ptr + 240 ] +# asm 1: movq 240(h15=int64#13 +# asm 2: movq 240(h15=%r15 +movq 240(%rcx),%r15 + +# qhasm: h15 ^= *(uint64 *) ( ptr + 232 ) +# asm 1: xorq 232(h3=int64#13 +# asm 2: mov h3=%r15 +mov %r15,%r15 + +# qhasm: h14 = mem64[ ptr + 216 ] +# asm 1: movq 216(h14=int64#14 +# asm 2: movq 216(h14=%rbx +movq 216(%rcx),%rbx + +# qhasm: h14 ^= *(uint64 *) ( ptr + 224 ) +# asm 1: xorq 224(h2=int64#14 +# asm 2: mov h2=%rbx +mov %rbx,%rbx + +# qhasm: h13 ^= *(uint64 *) ( ptr + 208 ) +# asm 1: xorq 208(h1=int64#3 +# asm 2: mov h1=%rdx +mov %rdx,%rdx + +# qhasm: h12 ^= *(uint64 *) ( ptr + 184 ) +# asm 1: xorq 184(h0=int64#6 +# asm 2: mov h0=%r9 +mov %r9,%r9 + +# qhasm: h11 ^= *(uint64 *) ( ptr + 176 ) +# asm 1: xorq 176(caller_r11=int64#9 +# asm 2: movq caller_r11=%r11 +movq 608(%rsp),%r11 + +# qhasm: caller_r12 = r12_stack +# asm 1: movq caller_r12=int64#10 +# asm 2: movq caller_r12=%r12 +movq 616(%rsp),%r12 + +# qhasm: caller_r13 = r13_stack +# asm 1: movq caller_r13=int64#11 +# asm 2: movq caller_r13=%r13 +movq 624(%rsp),%r13 + +# qhasm: caller_r14 = r14_stack +# asm 1: movq caller_r14=int64#12 +# asm 2: movq caller_r14=%r14 +movq 632(%rsp),%r14 + +# qhasm: caller_r15 = r15_stack +# asm 1: movq caller_r15=int64#13 +# asm 2: movq caller_r15=%r15 +movq 640(%rsp),%r15 + +# qhasm: caller_rbx = rbx_stack +# asm 1: movq caller_rbx=int64#14 +# asm 2: movq caller_rbx=%rbx +movq 648(%rsp),%rbx + +# qhasm: return +add %r11,%rsp +ret diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_avx/vec_mul_sp_asm.S b/src/kem/classic_mceliece/pqclean_mceliece348864_avx/vec_mul_sp_asm.S new file mode 100644 index 0000000000..0df3f5217d --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_avx/vec_mul_sp_asm.S @@ -0,0 +1,1115 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: reg256 s0 + +# qhasm: reg256 s1 + +# qhasm: reg256 s2 + +# qhasm: reg256 s3 + +# qhasm: reg256 s4 + +# qhasm: reg256 s5 + +# qhasm: reg256 b0 
+ +# qhasm: reg256 b1 + +# qhasm: reg256 b2 + +# qhasm: reg256 b3 + +# qhasm: reg256 b4 + +# qhasm: reg256 b5 + +# qhasm: reg256 a0 + +# qhasm: reg256 a1 + +# qhasm: reg256 a2 + +# qhasm: reg256 a3 + +# qhasm: reg256 a4 + +# qhasm: reg256 a5 + +# qhasm: reg256 r0 + +# qhasm: reg256 r1 + +# qhasm: reg256 r2 + +# qhasm: reg256 r3 + +# qhasm: reg256 r4 + +# qhasm: reg256 r5 + +# qhasm: reg256 r6 + +# qhasm: reg256 r7 + +# qhasm: reg256 r8 + +# qhasm: reg256 r9 + +# qhasm: reg256 r10 + +# qhasm: reg256 r11 + +# qhasm: reg256 r12 + +# qhasm: reg256 r13 + +# qhasm: reg256 r14 + +# qhasm: reg256 r15 + +# qhasm: reg256 r16 + +# qhasm: reg256 r17 + +# qhasm: reg256 r18 + +# qhasm: reg256 r19 + +# qhasm: reg256 r20 + +# qhasm: reg256 r21 + +# qhasm: reg256 r22 + +# qhasm: reg256 r + +# qhasm: int64 h0 + +# qhasm: int64 h1 + +# qhasm: int64 h2 + +# qhasm: int64 h3 + +# qhasm: int64 h4 + +# qhasm: int64 h5 + +# qhasm: int64 h6 + +# qhasm: int64 h7 + +# qhasm: int64 h8 + +# qhasm: int64 h9 + +# qhasm: int64 h10 + +# qhasm: int64 h11 + +# qhasm: int64 h12 + +# qhasm: int64 h13 + +# qhasm: int64 h14 + +# qhasm: int64 h15 + +# qhasm: int64 h16 + +# qhasm: int64 h17 + +# qhasm: int64 h18 + +# qhasm: int64 h19 + +# qhasm: int64 h20 + +# qhasm: int64 h21 + +# qhasm: int64 h22 + +# qhasm: stack4864 buf + +# qhasm: int64 ptr + +# qhasm: int64 tmp + +# qhasm: stack64 r11_stack + +# qhasm: stack64 r12_stack + +# qhasm: stack64 r13_stack + +# qhasm: stack64 r14_stack + +# qhasm: stack64 r15_stack + +# qhasm: stack64 rbx_stack + +# qhasm: stack64 rbp_stack + +# qhasm: enter vec_mul_sp_asm +.p2align 5 +.global _PQCLEAN_MCELIECE348864_AVX_vec_mul_sp_asm +.global PQCLEAN_MCELIECE348864_AVX_vec_mul_sp_asm +_PQCLEAN_MCELIECE348864_AVX_vec_mul_sp_asm: +PQCLEAN_MCELIECE348864_AVX_vec_mul_sp_asm: +mov %rsp,%r11 +and $31,%r11 +add $672,%r11 +sub %r11,%rsp + +# qhasm: r11_stack = caller_r11 +# asm 1: movq r11_stack=stack64#1 +# asm 2: movq r11_stack=608(%rsp) +movq %r11,608(%rsp) + +# qhasm: r12_stack = caller_r12 +# asm 1: movq r12_stack=stack64#2 +# asm 2: movq r12_stack=616(%rsp) +movq %r12,616(%rsp) + +# qhasm: r13_stack = caller_r13 +# asm 1: movq r13_stack=stack64#3 +# asm 2: movq r13_stack=624(%rsp) +movq %r13,624(%rsp) + +# qhasm: r14_stack = caller_r14 +# asm 1: movq r14_stack=stack64#4 +# asm 2: movq r14_stack=632(%rsp) +movq %r14,632(%rsp) + +# qhasm: r15_stack = caller_r15 +# asm 1: movq r15_stack=stack64#5 +# asm 2: movq r15_stack=640(%rsp) +movq %r15,640(%rsp) + +# qhasm: rbx_stack = caller_rbx +# asm 1: movq rbx_stack=stack64#6 +# asm 2: movq rbx_stack=648(%rsp) +movq %rbx,648(%rsp) + +# qhasm: ptr = &buf +# asm 1: leaq ptr=int64#4 +# asm 2: leaq ptr=%rcx +leaq 0(%rsp),%rcx + +# qhasm: s0 = mem256[ input_1 + 0 ] +# asm 1: vmovupd 0(s0=reg256#1 +# asm 2: vmovupd 0(s0=%ymm0 +vmovupd 0(%rsi),%ymm0 + +# qhasm: s1 = mem256[ input_1 + 32 ] +# asm 1: vmovupd 32(s1=reg256#2 +# asm 2: vmovupd 32(s1=%ymm1 +vmovupd 32(%rsi),%ymm1 + +# qhasm: s2 = mem256[ input_1 + 64 ] +# asm 1: vmovupd 64(s2=reg256#3 +# asm 2: vmovupd 64(s2=%ymm2 +vmovupd 64(%rsi),%ymm2 + +# qhasm: a5[0,1,2,3] = s2[2,2,3,3] +# asm 1: vpermq $0xfa,a5=reg256#4 +# asm 2: vpermq $0xfa,a5=%ymm3 +vpermq $0xfa,%ymm2,%ymm3 + +# qhasm: r = mem256[ input_2 + 160 ] +# asm 1: vmovupd 160(r=reg256#5 +# asm 2: vmovupd 160(r=%ymm4 +vmovupd 160(%rdx),%ymm4 + +# qhasm: b5[0,1,2,3] = r[1,3,1,3] +# asm 1: vpermq $0xdd,b5=reg256#5 +# asm 2: vpermq $0xdd,b5=%ymm4 +vpermq $0xdd,%ymm4,%ymm4 + +# qhasm: r10 = a5 & b5 +# asm 1: vpand r10=reg256#6 +# asm 2: vpand r10=%ymm5 
+vpand %ymm3,%ymm4,%ymm5 + +# qhasm: mem256[ ptr + 320 ] = r10 +# asm 1: vmovupd r=reg256#6 +# asm 2: vmovupd 128(r=%ymm5 +vmovupd 128(%rdx),%ymm5 + +# qhasm: b4[0,1,2,3] = r[1,3,1,3] +# asm 1: vpermq $0xdd,b4=reg256#6 +# asm 2: vpermq $0xdd,b4=%ymm5 +vpermq $0xdd,%ymm5,%ymm5 + +# qhasm: r9 = a5 & b4 +# asm 1: vpand r9=reg256#7 +# asm 2: vpand r9=%ymm6 +vpand %ymm3,%ymm5,%ymm6 + +# qhasm: r = mem256[ input_2 + 96 ] +# asm 1: vmovupd 96(r=reg256#8 +# asm 2: vmovupd 96(r=%ymm7 +vmovupd 96(%rdx),%ymm7 + +# qhasm: b3[0,1,2,3] = r[1,3,1,3] +# asm 1: vpermq $0xdd,b3=reg256#8 +# asm 2: vpermq $0xdd,b3=%ymm7 +vpermq $0xdd,%ymm7,%ymm7 + +# qhasm: r8 = a5 & b3 +# asm 1: vpand r8=reg256#9 +# asm 2: vpand r8=%ymm8 +vpand %ymm3,%ymm7,%ymm8 + +# qhasm: r = mem256[ input_2 + 64 ] +# asm 1: vmovupd 64(r=reg256#10 +# asm 2: vmovupd 64(r=%ymm9 +vmovupd 64(%rdx),%ymm9 + +# qhasm: b2[0,1,2,3] = r[1,3,1,3] +# asm 1: vpermq $0xdd,b2=reg256#10 +# asm 2: vpermq $0xdd,b2=%ymm9 +vpermq $0xdd,%ymm9,%ymm9 + +# qhasm: r7 = a5 & b2 +# asm 1: vpand r7=reg256#11 +# asm 2: vpand r7=%ymm10 +vpand %ymm3,%ymm9,%ymm10 + +# qhasm: r = mem256[ input_2 + 32 ] +# asm 1: vmovupd 32(r=reg256#12 +# asm 2: vmovupd 32(r=%ymm11 +vmovupd 32(%rdx),%ymm11 + +# qhasm: b1[0,1,2,3] = r[1,3,1,3] +# asm 1: vpermq $0xdd,b1=reg256#12 +# asm 2: vpermq $0xdd,b1=%ymm11 +vpermq $0xdd,%ymm11,%ymm11 + +# qhasm: r6 = a5 & b1 +# asm 1: vpand r6=reg256#13 +# asm 2: vpand r6=%ymm12 +vpand %ymm3,%ymm11,%ymm12 + +# qhasm: r = mem256[ input_2 + 0 ] +# asm 1: vmovupd 0(r=reg256#14 +# asm 2: vmovupd 0(r=%ymm13 +vmovupd 0(%rdx),%ymm13 + +# qhasm: b0[0,1,2,3] = r[1,3,1,3] +# asm 1: vpermq $0xdd,b0=reg256#14 +# asm 2: vpermq $0xdd,b0=%ymm13 +vpermq $0xdd,%ymm13,%ymm13 + +# qhasm: r5 = a5 & b0 +# asm 1: vpand r5=reg256#4 +# asm 2: vpand r5=%ymm3 +vpand %ymm3,%ymm13,%ymm3 + +# qhasm: a4[0,1,2,3] = s2[0,0,1,1] +# asm 1: vpermq $0x50,a4=reg256#3 +# asm 2: vpermq $0x50,a4=%ymm2 +vpermq $0x50,%ymm2,%ymm2 + +# qhasm: r = a4 & b5 +# asm 1: vpand r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm2,%ymm4,%ymm14 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#7 +# asm 2: vpand r=%ymm6 +vpand %ymm2,%ymm5,%ymm6 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#7 +# asm 2: vpand r=%ymm6 +vpand %ymm2,%ymm7,%ymm6 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#7 +# asm 2: vpand r=%ymm6 +vpand %ymm2,%ymm9,%ymm6 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#7 +# asm 2: vpand r=%ymm6 +vpand %ymm2,%ymm11,%ymm6 + +# qhasm: r5 ^= r +# asm 1: vpxor r4=reg256#3 +# asm 2: vpand r4=%ymm2 +vpand %ymm2,%ymm13,%ymm2 + +# qhasm: a3[0,1,2,3] = s1[2,2,3,3] +# asm 1: vpermq $0xfa,a3=reg256#7 +# asm 2: vpermq $0xfa,a3=%ymm6 +vpermq $0xfa,%ymm1,%ymm6 + +# qhasm: r = a3 & b5 +# asm 1: vpand r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm6,%ymm4,%ymm14 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#9 +# asm 2: vpand r=%ymm8 +vpand %ymm6,%ymm5,%ymm8 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#9 +# asm 2: vpand r=%ymm8 +vpand %ymm6,%ymm7,%ymm8 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#9 +# asm 2: vpand r=%ymm8 +vpand %ymm6,%ymm9,%ymm8 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#9 +# asm 2: vpand r=%ymm8 +vpand %ymm6,%ymm11,%ymm8 + +# qhasm: r4 ^= r +# asm 1: vpxor r3=reg256#7 +# asm 2: vpand r3=%ymm6 +vpand %ymm6,%ymm13,%ymm6 + +# qhasm: a2[0,1,2,3] = s1[0,0,1,1] +# asm 1: vpermq $0x50,a2=reg256#2 +# asm 2: vpermq $0x50,a2=%ymm1 +vpermq $0x50,%ymm1,%ymm1 + +# qhasm: r = a2 & b5 +# asm 1: vpand r=reg256#9 +# asm 2: vpand r=%ymm8 +vpand %ymm1,%ymm4,%ymm8 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#9 +# asm 2: 
vpand r=%ymm8 +vpand %ymm1,%ymm5,%ymm8 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#9 +# asm 2: vpand r=%ymm8 +vpand %ymm1,%ymm7,%ymm8 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#9 +# asm 2: vpand r=%ymm8 +vpand %ymm1,%ymm9,%ymm8 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#9 +# asm 2: vpand r=%ymm8 +vpand %ymm1,%ymm11,%ymm8 + +# qhasm: r3 ^= r +# asm 1: vpxor r2=reg256#2 +# asm 2: vpand r2=%ymm1 +vpand %ymm1,%ymm13,%ymm1 + +# qhasm: a1[0,1,2,3] = s0[2,2,3,3] +# asm 1: vpermq $0xfa,a1=reg256#9 +# asm 2: vpermq $0xfa,a1=%ymm8 +vpermq $0xfa,%ymm0,%ymm8 + +# qhasm: r = a1 & b5 +# asm 1: vpand r=reg256#11 +# asm 2: vpand r=%ymm10 +vpand %ymm8,%ymm4,%ymm10 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#11 +# asm 2: vpand r=%ymm10 +vpand %ymm8,%ymm5,%ymm10 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#11 +# asm 2: vpand r=%ymm10 +vpand %ymm8,%ymm7,%ymm10 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#11 +# asm 2: vpand r=%ymm10 +vpand %ymm8,%ymm9,%ymm10 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#11 +# asm 2: vpand r=%ymm10 +vpand %ymm8,%ymm11,%ymm10 + +# qhasm: r2 ^= r +# asm 1: vpxor r1=reg256#9 +# asm 2: vpand r1=%ymm8 +vpand %ymm8,%ymm13,%ymm8 + +# qhasm: a0[0,1,2,3] = s0[0,0,1,1] +# asm 1: vpermq $0x50,a0=reg256#1 +# asm 2: vpermq $0x50,a0=%ymm0 +vpermq $0x50,%ymm0,%ymm0 + +# qhasm: r = a0 & b5 +# asm 1: vpand r=reg256#5 +# asm 2: vpand r=%ymm4 +vpand %ymm0,%ymm4,%ymm4 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm0,%ymm5,%ymm3 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm0,%ymm7,%ymm3 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm0,%ymm9,%ymm3 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm0,%ymm11,%ymm3 + +# qhasm: r1 ^= r +# asm 1: vpxor r0=reg256#1 +# asm 2: vpand r0=%ymm0 +vpand %ymm0,%ymm13,%ymm0 + +# qhasm: mem256[ ptr + 128 ] = r4 +# asm 1: vmovupd h22=int64#2 +# asm 2: movq 344(h22=%rsi +movq 344(%rcx),%rsi + +# qhasm: h13 = h22 +# asm 1: mov h13=int64#3 +# asm 2: mov h13=%rdx +mov %rsi,%rdx + +# qhasm: h10 = h22 +# asm 1: mov h10=int64#2 +# asm 2: mov h10=%rsi +mov %rsi,%rsi + +# qhasm: h21 = mem64[ ptr + 336 ] +# asm 1: movq 336(h21=int64#5 +# asm 2: movq 336(h21=%r8 +movq 336(%rcx),%r8 + +# qhasm: h21 ^= *(uint64 *) ( ptr + 328 ) +# asm 1: xorq 328(h12=int64#6 +# asm 2: mov h12=%r9 +mov %r8,%r9 + +# qhasm: h9 = h21 +# asm 1: mov h9=int64#5 +# asm 2: mov h9=%r8 +mov %r8,%r8 + +# qhasm: h20 = mem64[ ptr + 312 ] +# asm 1: movq 312(h20=int64#7 +# asm 2: movq 312(h20=%rax +movq 312(%rcx),%rax + +# qhasm: h20 ^= *(uint64 *) ( ptr + 320 ) +# asm 1: xorq 320(h11=int64#8 +# asm 2: mov h11=%r10 +mov %rax,%r10 + +# qhasm: h8 = h20 +# asm 1: mov h8=int64#7 +# asm 2: mov h8=%rax +mov %rax,%rax + +# qhasm: h19 = mem64[ ptr + 304 ] +# asm 1: movq 304(h19=int64#9 +# asm 2: movq 304(h19=%r11 +movq 304(%rcx),%r11 + +# qhasm: h19 ^= *(uint64 *) ( ptr + 296 ) +# asm 1: xorq 296(h7=int64#9 +# asm 2: mov h7=%r11 +mov %r11,%r11 + +# qhasm: h18 = mem64[ ptr + 280 ] +# asm 1: movq 280(h18=int64#10 +# asm 2: movq 280(h18=%r12 +movq 280(%rcx),%r12 + +# qhasm: h18 ^= *(uint64 *) ( ptr + 288 ) +# asm 1: xorq 288(h6=int64#10 +# asm 2: mov h6=%r12 +mov %r12,%r12 + +# qhasm: h17 = mem64[ ptr + 272 ] +# asm 1: movq 272(h17=int64#11 +# asm 2: movq 272(h17=%r13 +movq 272(%rcx),%r13 + +# qhasm: h17 ^= *(uint64 *) ( ptr + 264 ) +# asm 1: xorq 264(h5=int64#11 +# asm 2: mov h5=%r13 +mov %r13,%r13 + +# qhasm: h16 = mem64[ ptr + 248 ] +# asm 1: movq 248(h16=int64#12 +# asm 2: movq 
248(h16=%r14 +movq 248(%rcx),%r14 + +# qhasm: h16 ^= *(uint64 *) ( ptr + 256 ) +# asm 1: xorq 256(h4=int64#12 +# asm 2: mov h4=%r14 +mov %r14,%r14 + +# qhasm: h15 = mem64[ ptr + 240 ] +# asm 1: movq 240(h15=int64#13 +# asm 2: movq 240(h15=%r15 +movq 240(%rcx),%r15 + +# qhasm: h15 ^= *(uint64 *) ( ptr + 232 ) +# asm 1: xorq 232(h3=int64#13 +# asm 2: mov h3=%r15 +mov %r15,%r15 + +# qhasm: h14 = mem64[ ptr + 216 ] +# asm 1: movq 216(h14=int64#14 +# asm 2: movq 216(h14=%rbx +movq 216(%rcx),%rbx + +# qhasm: h14 ^= *(uint64 *) ( ptr + 224 ) +# asm 1: xorq 224(h2=int64#14 +# asm 2: mov h2=%rbx +mov %rbx,%rbx + +# qhasm: h13 ^= *(uint64 *) ( ptr + 208 ) +# asm 1: xorq 208(h1=int64#3 +# asm 2: mov h1=%rdx +mov %rdx,%rdx + +# qhasm: h12 ^= *(uint64 *) ( ptr + 184 ) +# asm 1: xorq 184(h0=int64#6 +# asm 2: mov h0=%r9 +mov %r9,%r9 + +# qhasm: h11 ^= *(uint64 *) ( ptr + 176 ) +# asm 1: xorq 176(caller_r11=int64#9 +# asm 2: movq caller_r11=%r11 +movq 608(%rsp),%r11 + +# qhasm: caller_r12 = r12_stack +# asm 1: movq caller_r12=int64#10 +# asm 2: movq caller_r12=%r12 +movq 616(%rsp),%r12 + +# qhasm: caller_r13 = r13_stack +# asm 1: movq caller_r13=int64#11 +# asm 2: movq caller_r13=%r13 +movq 624(%rsp),%r13 + +# qhasm: caller_r14 = r14_stack +# asm 1: movq caller_r14=int64#12 +# asm 2: movq caller_r14=%r14 +movq 632(%rsp),%r14 + +# qhasm: caller_r15 = r15_stack +# asm 1: movq caller_r15=int64#13 +# asm 2: movq caller_r15=%r15 +movq 640(%rsp),%r15 + +# qhasm: caller_rbx = rbx_stack +# asm 1: movq caller_rbx=int64#14 +# asm 2: movq caller_rbx=%rbx +movq 648(%rsp),%rbx + +# qhasm: return +add %r11,%rsp +ret diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_avx/vec_reduce_asm.S b/src/kem/classic_mceliece/pqclean_mceliece348864_avx/vec_reduce_asm.S new file mode 100644 index 0000000000..9f07f500f2 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_avx/vec_reduce_asm.S @@ -0,0 +1,356 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: int64 t + +# qhasm: int64 c + +# qhasm: int64 r + +# qhasm: enter vec_reduce_asm +.p2align 5 +.global _PQCLEAN_MCELIECE348864_AVX_vec_reduce_asm +.global PQCLEAN_MCELIECE348864_AVX_vec_reduce_asm +_PQCLEAN_MCELIECE348864_AVX_vec_reduce_asm: +PQCLEAN_MCELIECE348864_AVX_vec_reduce_asm: +mov %rsp,%r11 +and $31,%r11 +add $0,%r11 +sub %r11,%rsp + +# qhasm: r = 0 +# asm 1: mov $0,>r=int64#7 +# asm 2: mov $0,>r=%rax +mov $0,%rax + +# qhasm: t = mem64[ input_0 + 88 ] +# asm 1: movq 88(t=int64#2 +# asm 2: movq 88(t=%rsi +movq 88(%rdi),%rsi + +# qhasm: c = count(t) +# asm 1: popcnt c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t=int64#2 +# asm 2: movq 80(t=%rsi +movq 80(%rdi),%rsi + +# qhasm: c = count(t) +# asm 1: popcnt c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t=int64#2 +# asm 2: movq 72(t=%rsi +movq 72(%rdi),%rsi + +# qhasm: c = count(t) +# asm 1: popcnt c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t=int64#2 +# asm 2: movq 64(t=%rsi +movq 64(%rdi),%rsi + +# qhasm: c = count(t) +# asm 1: popcnt c=int64#2 +# asm 2: popcnt c=%rsi 
+popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t=int64#2 +# asm 2: movq 56(t=%rsi +movq 56(%rdi),%rsi + +# qhasm: c = count(t) +# asm 1: popcnt c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t=int64#2 +# asm 2: movq 48(t=%rsi +movq 48(%rdi),%rsi + +# qhasm: c = count(t) +# asm 1: popcnt c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t=int64#2 +# asm 2: movq 40(t=%rsi +movq 40(%rdi),%rsi + +# qhasm: c = count(t) +# asm 1: popcnt c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t=int64#2 +# asm 2: movq 32(t=%rsi +movq 32(%rdi),%rsi + +# qhasm: c = count(t) +# asm 1: popcnt c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t=int64#2 +# asm 2: movq 24(t=%rsi +movq 24(%rdi),%rsi + +# qhasm: c = count(t) +# asm 1: popcnt c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t=int64#2 +# asm 2: movq 16(t=%rsi +movq 16(%rdi),%rsi + +# qhasm: c = count(t) +# asm 1: popcnt c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t=int64#2 +# asm 2: movq 8(t=%rsi +movq 8(%rdi),%rsi + +# qhasm: c = count(t) +# asm 1: popcnt c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t=int64#1 +# asm 2: movq 0(t=%rdi +movq 0(%rdi),%rdi + +# qhasm: c = count(t) +# asm 1: popcnt c=int64#1 +# asm 2: popcnt c=%rdi +popcnt %rdi, %rdi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1, +#include + +#include "aes.h" + + +void PQCLEAN_MCELIECE348864_CLEAN_aes256ctr( + uint8_t *out, + size_t outlen, + const uint8_t nonce[AESCTR_NONCEBYTES], + const uint8_t key[AES256_KEYBYTES] +); + +#endif diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_clean/api.h b/src/kem/classic_mceliece/pqclean_mceliece348864_clean/api.h new file mode 100644 index 0000000000..0bebdcb571 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_clean/api.h @@ -0,0 +1,32 @@ +#ifndef PQCLEAN_MCELIECE348864_CLEAN_API_H +#define PQCLEAN_MCELIECE348864_CLEAN_API_H + +#include + +#define PQCLEAN_MCELIECE348864_CLEAN_CRYPTO_ALGNAME "Classic McEliece 348864" +#define PQCLEAN_MCELIECE348864_CLEAN_CRYPTO_PUBLICKEYBYTES 261120 +#define PQCLEAN_MCELIECE348864_CLEAN_CRYPTO_SECRETKEYBYTES 6452 +#define PQCLEAN_MCELIECE348864_CLEAN_CRYPTO_CIPHERTEXTBYTES 128 +#define PQCLEAN_MCELIECE348864_CLEAN_CRYPTO_BYTES 32 + + +int PQCLEAN_MCELIECE348864_CLEAN_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const uint8_t *pk +); + +int PQCLEAN_MCELIECE348864_CLEAN_crypto_kem_dec( + uint8_t *key, + const uint8_t *c, + const uint8_t *sk +); + +int PQCLEAN_MCELIECE348864_CLEAN_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_clean/benes.c b/src/kem/classic_mceliece/pqclean_mceliece348864_clean/benes.c new file mode 100644 index 0000000000..39f639a1ca --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_clean/benes.c @@ -0,0 +1,139 @@ +/* + This file is for Benes network related functions +*/ + +#include "benes.h" + +#include "params.h" +#include "transpose.h" +#include "util.h" + +/* one layer of the benes network */ +static void layer(uint64_t *data, uint64_t *bits, int lgs) { + int i, j, s; + + uint64_t d; + + s = 1 << lgs; + + for (i = 0; i < 64; i += s * 2) { + for (j = i; j < i + s; j++) { + + d = (data[j + 0] ^ data[j + 
s]); + d &= (*bits++); + data[j + 0] ^= d; + data[j + s] ^= d; + } + } +} + +/* input: r, sequence of bits to be permuted */ +/* bits, condition bits of the Benes network */ +/* rev, 0 for normal application; !0 for inverse */ +/* output: r, permuted bits */ +void PQCLEAN_MCELIECE348864_CLEAN_apply_benes(unsigned char *r, const unsigned char *bits, int rev) { + int i; + + const unsigned char *cond_ptr; + int inc, low; + + uint64_t bs[64]; + uint64_t cond[64]; + + // + + for (i = 0; i < 64; i++) { + bs[i] = PQCLEAN_MCELIECE348864_CLEAN_load8(r + i * 8); + } + + if (rev == 0) { + inc = 256; + cond_ptr = bits; + } else { + inc = -256; + cond_ptr = bits + (2 * GFBITS - 2) * 256; + } + + // + + PQCLEAN_MCELIECE348864_CLEAN_transpose_64x64(bs, bs); + + for (low = 0; low <= 5; low++) { + for (i = 0; i < 64; i++) { + cond[i] = PQCLEAN_MCELIECE348864_CLEAN_load4(cond_ptr + i * 4); + } + PQCLEAN_MCELIECE348864_CLEAN_transpose_64x64(cond, cond); + layer(bs, cond, low); + cond_ptr += inc; + } + + PQCLEAN_MCELIECE348864_CLEAN_transpose_64x64(bs, bs); + + for (low = 0; low <= 5; low++) { + for (i = 0; i < 32; i++) { + cond[i] = PQCLEAN_MCELIECE348864_CLEAN_load8(cond_ptr + i * 8); + } + layer(bs, cond, low); + cond_ptr += inc; + } + for (low = 4; low >= 0; low--) { + for (i = 0; i < 32; i++) { + cond[i] = PQCLEAN_MCELIECE348864_CLEAN_load8(cond_ptr + i * 8); + } + layer(bs, cond, low); + cond_ptr += inc; + } + + PQCLEAN_MCELIECE348864_CLEAN_transpose_64x64(bs, bs); + + for (low = 5; low >= 0; low--) { + for (i = 0; i < 64; i++) { + cond[i] = PQCLEAN_MCELIECE348864_CLEAN_load4(cond_ptr + i * 4); + } + PQCLEAN_MCELIECE348864_CLEAN_transpose_64x64(cond, cond); + layer(bs, cond, low); + cond_ptr += inc; + } + + PQCLEAN_MCELIECE348864_CLEAN_transpose_64x64(bs, bs); + + + for (i = 0; i < 64; i++) { + PQCLEAN_MCELIECE348864_CLEAN_store8(r + i * 8, bs[i]); + } +} + +/* input: condition bits c */ +/* output: support s */ +void PQCLEAN_MCELIECE348864_CLEAN_support_gen(gf *s, const unsigned char *c) { + gf a; + int i, j; + unsigned char L[ GFBITS ][ (1 << GFBITS) / 8 ]; + + for (i = 0; i < GFBITS; i++) { + for (j = 0; j < (1 << GFBITS) / 8; j++) { + L[i][j] = 0; + } + } + + for (i = 0; i < (1 << GFBITS); i++) { + a = PQCLEAN_MCELIECE348864_CLEAN_bitrev((gf) i); + + for (j = 0; j < GFBITS; j++) { + L[j][ i / 8 ] |= ((a >> j) & 1) << (i % 8); + } + } + + for (j = 0; j < GFBITS; j++) { + PQCLEAN_MCELIECE348864_CLEAN_apply_benes(L[j], c, 0); + } + + for (i = 0; i < SYS_N; i++) { + s[i] = 0; + for (j = GFBITS - 1; j >= 0; j--) { + s[i] <<= 1; + s[i] |= (L[j][i / 8] >> (i % 8)) & 1; + } + } +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_clean/benes.h b/src/kem/classic_mceliece/pqclean_mceliece348864_clean/benes.h new file mode 100644 index 0000000000..29fc274075 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_clean/benes.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE348864_CLEAN_BENES_H +#define PQCLEAN_MCELIECE348864_CLEAN_BENES_H +/* + This file is for Benes network related functions +*/ + + +#include "gf.h" + +void PQCLEAN_MCELIECE348864_CLEAN_apply_benes(unsigned char * /*r*/, const unsigned char * /*bits*/, int /*rev*/); +void PQCLEAN_MCELIECE348864_CLEAN_support_gen(gf * /*s*/, const unsigned char * /*c*/); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_clean/bm.c b/src/kem/classic_mceliece/pqclean_mceliece348864_clean/bm.c new file mode 100644 index 0000000000..89b8ed4a74 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_clean/bm.c 
@@ -0,0 +1,83 @@ +/* + This file is for the Berlekamp-Massey algorithm + see http://crypto.stanford.edu/~mironov/cs359/massey.pdf +*/ +#include "bm.h" + +#include "params.h" + +#define min(a, b) (((a) < (b)) ? (a) : (b)) + +/* the Berlekamp-Massey algorithm */ +/* input: s, sequence of field elements */ +/* output: out, minimal polynomial of s */ +void PQCLEAN_MCELIECE348864_CLEAN_bm(gf *out, gf *s) { + int i; + + uint16_t N = 0; + uint16_t L = 0; + uint16_t mle; + uint16_t mne; + + gf T[ SYS_T + 1 ]; + gf C[ SYS_T + 1 ]; + gf B[ SYS_T + 1 ]; + + gf b = 1, d, f; + + // + + for (i = 0; i < SYS_T + 1; i++) { + C[i] = B[i] = 0; + } + + B[1] = C[0] = 1; + + // + + for (N = 0; N < 2 * SYS_T; N++) { + d = 0; + + for (i = 0; i <= min(N, SYS_T); i++) { + d ^= PQCLEAN_MCELIECE348864_CLEAN_gf_mul(C[i], s[ N - i]); + } + + mne = d; + mne -= 1; + mne >>= 15; + mne -= 1; + mle = N; + mle -= 2 * L; + mle >>= 15; + mle -= 1; + mle &= mne; + + for (i = 0; i <= SYS_T; i++) { + T[i] = C[i]; + } + + f = PQCLEAN_MCELIECE348864_CLEAN_gf_frac(b, d); + + for (i = 0; i <= SYS_T; i++) { + C[i] ^= PQCLEAN_MCELIECE348864_CLEAN_gf_mul(f, B[i]) & mne; + } + + L = (L & ~mle) | ((N + 1 - L) & mle); + + for (i = 0; i <= SYS_T; i++) { + B[i] = (B[i] & ~mle) | (T[i] & mle); + } + + b = (b & ~mle) | (d & mle); + + for (i = SYS_T; i >= 1; i--) { + B[i] = B[i - 1]; + } + B[0] = 0; + } + + for (i = 0; i <= SYS_T; i++) { + out[i] = C[ SYS_T - i ]; + } +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_clean/bm.h b/src/kem/classic_mceliece/pqclean_mceliece348864_clean/bm.h new file mode 100644 index 0000000000..c7da4878e3 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_clean/bm.h @@ -0,0 +1,13 @@ +#ifndef PQCLEAN_MCELIECE348864_CLEAN_BM_H +#define PQCLEAN_MCELIECE348864_CLEAN_BM_H +/* + This file is for the Berlekamp-Massey algorithm + see http://crypto.stanford.edu/~mironov/cs359/massey.pdf +*/ + +#include "gf.h" + +void PQCLEAN_MCELIECE348864_CLEAN_bm(gf * /*out*/, gf * /*s*/); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_clean/controlbits.c b/src/kem/classic_mceliece/pqclean_mceliece348864_clean/controlbits.c new file mode 100644 index 0000000000..7b3444e34f --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_clean/controlbits.c @@ -0,0 +1,274 @@ +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. 
a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + +#include "controlbits.h" + +#include "params.h" + +#include + +typedef uint8_t bit; + +#define N (1 << GFBITS) + +static bit is_smaller(uint32_t a, uint32_t b) { + uint32_t ret = 0; + + ret = a - b; + ret >>= 31; + + return (bit)ret; +} + +static bit is_smaller_63b(uint64_t a, uint64_t b) { + uint64_t ret = 0; + + ret = a - b; + ret >>= 63; + + return (bit)ret; +} + +static void cswap(uint32_t *x, uint32_t *y, bit swap) { + uint32_t m; + uint32_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +static void cswap_63b(uint64_t *x, uint64_t *y, bit swap) { + uint64_t m; + uint64_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +/* output x = min(input x,input y) */ +/* output y = max(input x,input y) */ + +static void minmax(uint32_t *x, uint32_t *y) { + bit m; + + m = is_smaller(*y, *x); + cswap(x, y, m); +} + +static void minmax_63b(uint64_t *x, uint64_t *y) { + bit m; + + m = is_smaller_63b(*y, *x); + cswap_63b(x, y, m); +} + +/* merge first half of x[0],x[step],...,x[(2*n-1)*step] with second half */ +/* requires n to be a power of 2 */ + +static void merge(int n, uint32_t *x, int step) { + int i; + if (n == 1) { + minmax(&x[0], &x[step]); + } else { + merge(n / 2, x, step * 2); + merge(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax(&x[i * step], &x[(i + 1) * step]); + } + } +} + +static void merge_63b(int n, uint64_t *x, int step) { + int i; + if (n == 1) { + minmax_63b(&x[0], &x[step]); + } else { + merge_63b(n / 2, x, step * 2); + merge_63b(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax_63b(&x[i * step], &x[(i + 1) * step]); + } + } +} + +/* sort x[0],x[1],...,x[n-1] in place */ +/* requires n to be a power of 2 */ + +static void sort(int n, uint32_t *x) { + if (n <= 1) { + return; + } + sort(n / 2, x); + sort(n / 2, x + n / 2); + merge(n / 2, x, 1); +} + +void PQCLEAN_MCELIECE348864_CLEAN_sort_63b(int n, uint64_t *x) { + if (n <= 1) { + return; + } + PQCLEAN_MCELIECE348864_CLEAN_sort_63b(n / 2, x); + PQCLEAN_MCELIECE348864_CLEAN_sort_63b(n / 2, x + n / 2); + merge_63b(n / 2, x, 1); +} + +/* y[pi[i]] = x[i] */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void composeinv(int n, uint32_t *y, const uint32_t *x, const uint32_t *pi) { // NC + int i; + uint32_t t[2 * N]; + + for (i = 0; i < n; ++i) { + t[i] = x[i] | (pi[i] << 16); + } + + sort(n, t); + + for (i = 0; i < n; ++i) { + y[i] = t[i] & 0xFFFF; + } +} + +/* ip[i] = j iff pi[i] = j */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void invert(int n, uint32_t *ip, const uint32_t *pi) { + int i; + + for (i = 0; i < n; i++) { + ip[i] = i; + } + + composeinv(n, ip, ip, pi); +} + + +static void flow(int w, uint32_t *x, const uint32_t *y, int t) { + bit m0; + bit m1; + + uint32_t b; + uint32_t y_copy = *y; + + m0 = is_smaller(*y & ((1 << w) - 1), *x & ((1 << w) - 1)); + m1 = is_smaller(0, t); + + cswap(x, &y_copy, m0); + b = m0 & m1; + *x ^= b << w; +} + +/* input: permutation pi */ +/* output: (2w-1)n/2 (or 0 if n==1) control bits c[0],c[step],c[2*step],... 
*/ +/* requires n = 2^w */ +static void controlbitsfrompermutation(int w, int n, int step, int off, unsigned char *c, const uint32_t *pi) { + int i; + int j; + int k; + int t; + uint32_t ip[N] = {0}; + uint32_t I[2 * N] = {0}; + uint32_t P[2 * N] = {0}; + uint32_t PI[2 * N] = {0}; + uint32_t T[2 * N] = {0}; + uint32_t piflip[N] = {0}; + uint32_t subpi[2][N / 2] = {{0}}; + + if (w == 1) { + c[ off / 8 ] |= (pi[0] & 1) << (off % 8); + } + if (w <= 1) { + return; + } + + invert(n, ip, pi); + + for (i = 0; i < n; ++i) { + I[i] = ip[i] | (1 << w); + I[n + i] = pi[i]; + } + + for (i = 0; i < 2 * n; ++i) { + P[i] = (i >> w) + (i & ((1 << w) - 2)) + ((i & 1) << w); + } + + for (t = 0; t < w; ++t) { + composeinv(2 * n, PI, P, I); + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &PI[i], t); + } + + for (i = 0; i < 2 * n; ++i) { + T[i] = I[i ^ 1]; + } + + composeinv(2 * n, I, I, T); + + for (i = 0; i < 2 * n; ++i) { + T[i] = P[i ^ 1]; + } + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &T[i], 1); + } + } + + for (i = 0; i < n; ++i) { + for (j = 0; j < w; ++j) { + piflip[i] = pi[i]; + } + } + + for (i = 0; i < n / 2; ++i) { + c[ (off + i * step) / 8 ] |= ((P[i * 2] >> w) & 1) << ((off + i * step) % 8); + } + for (i = 0; i < n / 2; ++i) { + c[ (off + ((w - 1)*n + i) * step) / 8 ] |= ((P[n + i * 2] >> w) & 1) << ((off + ((w - 1) * n + i) * step) % 8); + } + + for (i = 0; i < n / 2; ++i) { + cswap(&piflip[i * 2], &piflip[i * 2 + 1], (P[n + i * 2] >> w) & 1); + } + + for (k = 0; k < 2; ++k) { + for (i = 0; i < n / 2; ++i) { + subpi[k][i] = piflip[i * 2 + k] >> 1; + } + } + + for (k = 0; k < 2; ++k) { + controlbitsfrompermutation(w - 1, n / 2, step * 2, off + step * (n / 2 + k), c, subpi[k]); + } +} + +/* input: pi, a permutation*/ +/* output: out, control bits w.r.t. pi */ +void PQCLEAN_MCELIECE348864_CLEAN_controlbits(unsigned char *out, const uint32_t *pi) { + unsigned int i; + unsigned char c[ (2 * GFBITS - 1) * (1 << GFBITS) / 16 ]; + + for (i = 0; i < sizeof(c); i++) { + c[i] = 0; + } + + controlbitsfrompermutation(GFBITS, (1 << GFBITS), 1, 0, c, pi); + + for (i = 0; i < sizeof(c); i++) { + out[i] = c[i]; + } +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_clean/controlbits.h b/src/kem/classic_mceliece/pqclean_mceliece348864_clean/controlbits.h new file mode 100644 index 0000000000..3125a8629c --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_clean/controlbits.h @@ -0,0 +1,15 @@ +#ifndef PQCLEAN_MCELIECE348864_CLEAN_CONTROLBITS_H +#define PQCLEAN_MCELIECE348864_CLEAN_CONTROLBITS_H +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. 
a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + + +#include + +void PQCLEAN_MCELIECE348864_CLEAN_sort_63b(int n, uint64_t *x); +void PQCLEAN_MCELIECE348864_CLEAN_controlbits(unsigned char *out, const uint32_t *pi); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_clean/crypto_hash.h b/src/kem/classic_mceliece/pqclean_mceliece348864_clean/crypto_hash.h new file mode 100644 index 0000000000..110ecfc9c7 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_clean/crypto_hash.h @@ -0,0 +1,7 @@ +#ifndef PQCLEAN_MCELIECE348864_CLEAN_CRYPTO_HASH_H +#define PQCLEAN_MCELIECE348864_CLEAN_CRYPTO_HASH_H +#include "fips202.h" + +#define crypto_hash_32b(out,in,inlen) shake256(out, 32, in, inlen) + +#endif diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_clean/decrypt.c b/src/kem/classic_mceliece/pqclean_mceliece348864_clean/decrypt.c new file mode 100644 index 0000000000..d180c5cdb3 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_clean/decrypt.c @@ -0,0 +1,90 @@ +/* + This file is for Niederreiter decryption +*/ + +#include "decrypt.h" + +#include "benes.h" +#include "bm.h" +#include "gf.h" +#include "params.h" +#include "root.h" +#include "synd.h" +#include "util.h" + +/* Niederreiter decryption with the Berlekamp decoder */ +/* intput: sk, secret key */ +/* c, ciphertext */ +/* output: e, error vector */ +/* return: 0 for success; 1 for failure */ +int PQCLEAN_MCELIECE348864_CLEAN_decrypt(unsigned char *e, const unsigned char *sk, const unsigned char *c) { + int i, w = 0; + uint16_t check; + + unsigned char r[ SYS_N / 8 ]; + + gf g[ SYS_T + 1 ]; + gf L[ SYS_N ]; + + gf s[ SYS_T * 2 ]; + gf s_cmp[ SYS_T * 2 ]; + gf locator[ SYS_T + 1 ]; + gf images[ SYS_N ]; + + gf t; + + // + + for (i = 0; i < SYND_BYTES; i++) { + r[i] = c[i]; + } + for (i = SYND_BYTES; i < SYS_N / 8; i++) { + r[i] = 0; + } + + for (i = 0; i < SYS_T; i++) { + g[i] = PQCLEAN_MCELIECE348864_CLEAN_load2(sk); + g[i] &= GFMASK; + sk += 2; + } + g[ SYS_T ] = 1; + + PQCLEAN_MCELIECE348864_CLEAN_support_gen(L, sk); + + PQCLEAN_MCELIECE348864_CLEAN_synd(s, g, L, r); + + PQCLEAN_MCELIECE348864_CLEAN_bm(locator, s); + + PQCLEAN_MCELIECE348864_CLEAN_root(images, locator, L); + + // + + for (i = 0; i < SYS_N / 8; i++) { + e[i] = 0; + } + + for (i = 0; i < SYS_N; i++) { + t = PQCLEAN_MCELIECE348864_CLEAN_gf_iszero(images[i]) & 1; + + e[ i / 8 ] |= t << (i % 8); + w += t; + + } + + PQCLEAN_MCELIECE348864_CLEAN_synd(s_cmp, g, L, e); + + // + + check = (uint16_t)w; + check ^= SYS_T; + + for (i = 0; i < SYS_T * 2; i++) { + check |= s[i] ^ s_cmp[i]; + } + + check -= 1; + check >>= 15; + + return check ^ 1; +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_clean/decrypt.h b/src/kem/classic_mceliece/pqclean_mceliece348864_clean/decrypt.h new file mode 100644 index 0000000000..4a80e068e7 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_clean/decrypt.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_MCELIECE348864_CLEAN_DECRYPT_H +#define PQCLEAN_MCELIECE348864_CLEAN_DECRYPT_H +/* + This file is for Nieddereiter decryption +*/ + +int PQCLEAN_MCELIECE348864_CLEAN_decrypt(unsigned char * /*e*/, const unsigned char * /*sk*/, const unsigned char * /*c*/); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_clean/encrypt.c b/src/kem/classic_mceliece/pqclean_mceliece348864_clean/encrypt.c new file mode 100644 index 0000000000..27a6ea4f07 --- /dev/null +++ 
b/src/kem/classic_mceliece/pqclean_mceliece348864_clean/encrypt.c @@ -0,0 +1,138 @@ +/* + This file is for Niederreiter encryption +*/ + +#include "encrypt.h" + +#include "params.h" +#include "randombytes.h" +#include "util.h" + +#include +#include + +#include "gf.h" + +static inline uint8_t same_mask(uint16_t x, uint16_t y) { + uint32_t mask; + + mask = x ^ y; + mask -= 1; + mask >>= 31; + mask = -mask; + + return (uint8_t)mask; +} + +/* output: e, an error vector of weight t */ +static void gen_e(unsigned char *e) { + size_t i, j; + int eq, count; + + uint16_t ind_[ SYS_T * 2 ]; + uint8_t *ind_8 = (uint8_t *)ind_; + uint16_t ind[ SYS_T * 2 ]; + uint8_t mask; + unsigned char val[ SYS_T ]; + + while (1) { + randombytes(ind_8, sizeof(ind_)); + // Copy to uint16_t ind_ in a little-endian way + for (i = 0; i < sizeof(ind_); i += 2) { + ind_[i / 2] = ((uint16_t)ind_8[i + 1]) << 8 | (uint16_t)ind_8[i]; + } + + for (i = 0; i < SYS_T * 2; i++) { + ind_[i] &= GFMASK; + } + + // moving and counting indices in the correct range + + count = 0; + for (i = 0; i < SYS_T * 2; i++) { + if (ind_[i] < SYS_N) { + ind[ count++ ] = ind_[i]; + } + } + + if (count < SYS_T) { + continue; + } + + // check for repetition + + eq = 0; + + for (i = 1; i < SYS_T; i++) { + for (j = 0; j < i; j++) { + if (ind[i] == ind[j]) { + eq = 1; + } + } + } + + if (eq == 0) { + break; + } + } + + for (j = 0; j < SYS_T; j++) { + val[j] = 1 << (ind[j] & 7); + } + + for (i = 0; i < SYS_N / 8; i++) { + e[i] = 0; + + for (j = 0; j < SYS_T; j++) { + mask = same_mask((uint16_t)i, (ind[j] >> 3)); + + e[i] |= val[j] & mask; + } + } +} + +/* input: public key pk, error vector e */ +/* output: syndrome s */ +static void syndrome(unsigned char *s, const unsigned char *pk, const unsigned char *e) { + unsigned char b, row[SYS_N / 8]; + const unsigned char *pk_ptr = pk; + + int i, j; + + for (i = 0; i < SYND_BYTES; i++) { + s[i] = 0; + } + + for (i = 0; i < PK_NROWS; i++) { + for (j = 0; j < SYS_N / 8; j++) { + row[j] = 0; + } + + for (j = 0; j < PK_ROW_BYTES; j++) { + row[ SYS_N / 8 - PK_ROW_BYTES + j ] = pk_ptr[j]; + } + + row[i / 8] |= 1 << (i % 8); + + b = 0; + for (j = 0; j < SYS_N / 8; j++) { + b ^= row[j] & e[j]; + } + + b ^= b >> 4; + b ^= b >> 2; + b ^= b >> 1; + b &= 1; + + s[ i / 8 ] |= (b << (i % 8)); + + pk_ptr += PK_ROW_BYTES; + } +} + +void PQCLEAN_MCELIECE348864_CLEAN_encrypt(unsigned char *s, unsigned char *e, const unsigned char *pk) { + gen_e(e); + + syndrome(s, pk, e); +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_clean/encrypt.h b/src/kem/classic_mceliece/pqclean_mceliece348864_clean/encrypt.h new file mode 100644 index 0000000000..2b6daf8683 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_clean/encrypt.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_MCELIECE348864_CLEAN_ENCRYPT_H +#define PQCLEAN_MCELIECE348864_CLEAN_ENCRYPT_H +/* + This file is for Niederreiter encryption +*/ + + +void PQCLEAN_MCELIECE348864_CLEAN_encrypt(unsigned char * /*s*/, unsigned char * /*e*/, const unsigned char * /*pk*/); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_clean/gf.c b/src/kem/classic_mceliece/pqclean_mceliece348864_clean/gf.c new file mode 100644 index 0000000000..d974bf607a --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_clean/gf.c @@ -0,0 +1,139 @@ +/* + This file is for functions for field arithmetic +*/ + +#include "gf.h" + +#include "params.h" + +gf PQCLEAN_MCELIECE348864_CLEAN_gf_iszero(gf a) { + uint32_t t = a; + + t -= 1; + t >>= 19; + + return (gf) t; +} 
+ +gf PQCLEAN_MCELIECE348864_CLEAN_gf_add(gf in0, gf in1) { + return in0 ^ in1; +} + +gf PQCLEAN_MCELIECE348864_CLEAN_gf_mul(gf in0, gf in1) { + int i; + + uint32_t tmp; + uint32_t t0; + uint32_t t1; + uint32_t t; + + t0 = in0; + t1 = in1; + + tmp = t0 * (t1 & 1); + + for (i = 1; i < GFBITS; i++) { + tmp ^= (t0 * (t1 & (1 << i))); + } + + t = tmp & 0x7FC000; + tmp ^= t >> 9; + tmp ^= t >> 12; + + t = tmp & 0x3000; + tmp ^= t >> 9; + tmp ^= t >> 12; + + return tmp & ((1 << GFBITS) - 1); +} + +/* input: field element in */ +/* return: in^2 */ +static inline gf gf_sq(gf in) { + const uint32_t B[] = {0x55555555, 0x33333333, 0x0F0F0F0F, 0x00FF00FF}; + + uint32_t x = in; + uint32_t t; + + x = (x | (x << 8)) & B[3]; + x = (x | (x << 4)) & B[2]; + x = (x | (x << 2)) & B[1]; + x = (x | (x << 1)) & B[0]; + + t = x & 0x7FC000; + x ^= t >> 9; + x ^= t >> 12; + + t = x & 0x3000; + x ^= t >> 9; + x ^= t >> 12; + + return x & ((1 << GFBITS) - 1); +} + +gf PQCLEAN_MCELIECE348864_CLEAN_gf_inv(gf in) { + gf tmp_11; + gf tmp_1111; + + gf out = in; + + out = gf_sq(out); + tmp_11 = PQCLEAN_MCELIECE348864_CLEAN_gf_mul(out, in); // 11 + + out = gf_sq(tmp_11); + out = gf_sq(out); + tmp_1111 = PQCLEAN_MCELIECE348864_CLEAN_gf_mul(out, tmp_11); // 1111 + + out = gf_sq(tmp_1111); + out = gf_sq(out); + out = gf_sq(out); + out = gf_sq(out); + out = PQCLEAN_MCELIECE348864_CLEAN_gf_mul(out, tmp_1111); // 11111111 + + out = gf_sq(out); + out = gf_sq(out); + out = PQCLEAN_MCELIECE348864_CLEAN_gf_mul(out, tmp_11); // 1111111111 + + out = gf_sq(out); + out = PQCLEAN_MCELIECE348864_CLEAN_gf_mul(out, in); // 11111111111 + + return gf_sq(out); // 111111111110 +} + +/* input: field element den, num */ +/* return: (num/den) */ +gf PQCLEAN_MCELIECE348864_CLEAN_gf_frac(gf den, gf num) { + return PQCLEAN_MCELIECE348864_CLEAN_gf_mul(PQCLEAN_MCELIECE348864_CLEAN_gf_inv(den), num); +} + +/* input: in0, in1 in GF((2^m)^t)*/ +/* output: out = in0*in1 */ +void PQCLEAN_MCELIECE348864_CLEAN_GF_mul(gf *out, const gf *in0, const gf *in1) { + int i, j; + + gf prod[ SYS_T * 2 - 1 ]; + + for (i = 0; i < SYS_T * 2 - 1; i++) { + prod[i] = 0; + } + + for (i = 0; i < SYS_T; i++) { + for (j = 0; j < SYS_T; j++) { + prod[i + j] ^= PQCLEAN_MCELIECE348864_CLEAN_gf_mul(in0[i], in1[j]); + } + } + + // + + for (i = (SYS_T - 1) * 2; i >= SYS_T; i--) { + prod[i - SYS_T + 9] ^= PQCLEAN_MCELIECE348864_CLEAN_gf_mul(prod[i], (gf) 877); + prod[i - SYS_T + 7] ^= PQCLEAN_MCELIECE348864_CLEAN_gf_mul(prod[i], (gf) 2888); + prod[i - SYS_T + 5] ^= PQCLEAN_MCELIECE348864_CLEAN_gf_mul(prod[i], (gf) 1781); + prod[i - SYS_T + 0] ^= PQCLEAN_MCELIECE348864_CLEAN_gf_mul(prod[i], (gf) 373); + } + + for (i = 0; i < SYS_T; i++) { + out[i] = prod[i]; + } +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_clean/gf.h b/src/kem/classic_mceliece/pqclean_mceliece348864_clean/gf.h new file mode 100644 index 0000000000..c445925a6c --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_clean/gf.h @@ -0,0 +1,22 @@ +#ifndef PQCLEAN_MCELIECE348864_CLEAN_GF_H +#define PQCLEAN_MCELIECE348864_CLEAN_GF_H +/* + This file is for functions for field arithmetic +*/ + + +#include + +typedef uint16_t gf; + +gf PQCLEAN_MCELIECE348864_CLEAN_gf_iszero(gf a); +gf PQCLEAN_MCELIECE348864_CLEAN_gf_add(gf in0, gf in1); +gf PQCLEAN_MCELIECE348864_CLEAN_gf_mul(gf in0, gf in1); +gf PQCLEAN_MCELIECE348864_CLEAN_gf_frac(gf den, gf num); +gf PQCLEAN_MCELIECE348864_CLEAN_gf_inv(gf in); +uint64_t PQCLEAN_MCELIECE348864_CLEAN_gf_mul2(gf a, gf b0, gf b1); + +void 
PQCLEAN_MCELIECE348864_CLEAN_GF_mul(gf *out, const gf *in0, const gf *in1); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_clean/operations.c b/src/kem/classic_mceliece/pqclean_mceliece348864_clean/operations.c new file mode 100644 index 0000000000..3a222d77ea --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_clean/operations.c @@ -0,0 +1,136 @@ +#include "api.h" + +#include "aes256ctr.h" +#include "controlbits.h" +#include "crypto_hash.h" +#include "decrypt.h" +#include "encrypt.h" +#include "params.h" +#include "pk_gen.h" +#include "randombytes.h" +#include "sk_gen.h" +#include "util.h" + +#include +#include + +int PQCLEAN_MCELIECE348864_CLEAN_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const uint8_t *pk +) { + uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; + uint8_t *e = two_e + 1; + uint8_t one_ec[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ] = {1}; + + PQCLEAN_MCELIECE348864_CLEAN_encrypt(c, e, pk); + + crypto_hash_32b(c + SYND_BYTES, two_e, sizeof(two_e)); + + memcpy(one_ec + 1, e, SYS_N / 8); + memcpy(one_ec + 1 + SYS_N / 8, c, SYND_BYTES + 32); + + crypto_hash_32b(key, one_ec, sizeof(one_ec)); + + return 0; +} + +int PQCLEAN_MCELIECE348864_CLEAN_crypto_kem_dec( + uint8_t *key, + const uint8_t *c, + const uint8_t *sk +) { + int i; + + uint8_t ret_confirm = 0; + uint8_t ret_decrypt = 0; + + uint16_t m; + + uint8_t conf[32]; + uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; + uint8_t *e = two_e + 1; + uint8_t preimage[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ]; + uint8_t *x = preimage; + + // + + ret_decrypt = (uint8_t)PQCLEAN_MCELIECE348864_CLEAN_decrypt(e, sk + SYS_N / 8, c); + + crypto_hash_32b(conf, two_e, sizeof(two_e)); + + for (i = 0; i < 32; i++) { + ret_confirm |= conf[i] ^ c[SYND_BYTES + i]; + } + + m = ret_decrypt | ret_confirm; + m -= 1; + m >>= 8; + + *x++ = (~m & 0) | (m & 1); + for (i = 0; i < SYS_N / 8; i++) { + *x++ = (~m & sk[i]) | (m & e[i]); + } + for (i = 0; i < SYND_BYTES + 32; i++) { + *x++ = c[i]; + } + + crypto_hash_32b(key, preimage, sizeof(preimage)); + + return 0; +} + +int PQCLEAN_MCELIECE348864_CLEAN_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +) { + int i; + uint8_t seed[ 32 ]; + uint8_t r[ SYS_T * 2 + (1 << GFBITS)*sizeof(uint32_t) + SYS_N / 8 + 32 ]; + uint8_t nonce[ 16 ] = {0}; + uint8_t *rp; + + gf f[ SYS_T ]; // element in GF(2^mt) + gf irr[ SYS_T ]; // Goppa polynomial + uint32_t perm[ 1 << GFBITS ]; // random permutation + + randombytes(seed, sizeof(seed)); + + while (1) { + rp = r; + PQCLEAN_MCELIECE348864_CLEAN_aes256ctr(r, sizeof(r), nonce, seed); + memcpy(seed, &r[ sizeof(r) - 32 ], 32); + + for (i = 0; i < SYS_T; i++) { + f[i] = PQCLEAN_MCELIECE348864_CLEAN_load2(rp + i * 2); + } + rp += sizeof(f); + if (PQCLEAN_MCELIECE348864_CLEAN_genpoly_gen(irr, f)) { + continue; + } + + for (i = 0; i < (1 << GFBITS); i++) { + perm[i] = PQCLEAN_MCELIECE348864_CLEAN_load4(rp + i * 4); + } + rp += sizeof(perm); + if (PQCLEAN_MCELIECE348864_CLEAN_perm_check(perm)) { + continue; + } + + for (i = 0; i < SYS_T; i++) { + PQCLEAN_MCELIECE348864_CLEAN_store2(sk + SYS_N / 8 + i * 2, irr[i]); + } + if (PQCLEAN_MCELIECE348864_CLEAN_pk_gen(pk, perm, sk + SYS_N / 8)) { + continue; + } + + memcpy(sk, rp, SYS_N / 8); + PQCLEAN_MCELIECE348864_CLEAN_controlbits(sk + SYS_N / 8 + IRR_BYTES, perm); + + break; + } + + return 0; +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_clean/params.h b/src/kem/classic_mceliece/pqclean_mceliece348864_clean/params.h new file mode 100644 index 0000000000..ae8aaa9179 --- /dev/null +++ 
b/src/kem/classic_mceliece/pqclean_mceliece348864_clean/params.h @@ -0,0 +1,21 @@ +#ifndef PQCLEAN_MCELIECE348864_CLEAN_PARAMS_H +#define PQCLEAN_MCELIECE348864_CLEAN_PARAMS_H + +#define GFBITS 12 +#define SYS_N 3488 +#define SYS_T 64 + +#define COND_BYTES ((1 << (GFBITS-4))*(2*GFBITS - 1)) +#define IRR_BYTES (SYS_T * 2) + +#define PK_NROWS (SYS_T*GFBITS) +#define PK_NCOLS (SYS_N - PK_NROWS) +#define PK_ROW_BYTES ((PK_NCOLS + 7)/8) + +#define SK_BYTES (SYS_N/8 + IRR_BYTES + COND_BYTES) +#define SYND_BYTES ((PK_NROWS + 7)/8) + +#define GFMASK ((1 << GFBITS) - 1) + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_clean/pk_gen.c b/src/kem/classic_mceliece/pqclean_mceliece348864_clean/pk_gen.c new file mode 100644 index 0000000000..eec02be001 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_clean/pk_gen.c @@ -0,0 +1,144 @@ +/* + This file is for public-key generation +*/ + +#include + +#include "benes.h" +#include "controlbits.h" +#include "gf.h" +#include "params.h" +#include "pk_gen.h" +#include "root.h" +#include "util.h" + +/* input: secret key sk */ +/* output: public key pk */ +int PQCLEAN_MCELIECE348864_CLEAN_pk_gen(uint8_t *pk, uint32_t *perm, const uint8_t *sk) { + int i, j, k; + int row, c; + + uint64_t buf[ 1 << GFBITS ]; + + uint8_t mat[ GFBITS * SYS_T ][ SYS_N / 8 ]; + uint8_t mask; + uint8_t b; + + gf g[ SYS_T + 1 ]; // Goppa polynomial + gf L[ SYS_N ]; // support + gf inv[ SYS_N ]; + + // + + g[ SYS_T ] = 1; + + for (i = 0; i < SYS_T; i++) { + g[i] = PQCLEAN_MCELIECE348864_CLEAN_load2(sk); + g[i] &= GFMASK; + sk += 2; + } + + for (i = 0; i < (1 << GFBITS); i++) { + buf[i] = perm[i]; + buf[i] <<= 31; + buf[i] |= i; + } + + PQCLEAN_MCELIECE348864_CLEAN_sort_63b(1 << GFBITS, buf); + + for (i = 0; i < (1 << GFBITS); i++) { + perm[i] = buf[i] & GFMASK; + } + for (i = 0; i < SYS_N; i++) { + L[i] = PQCLEAN_MCELIECE348864_CLEAN_bitrev((gf)perm[i]); + } + + // filling the matrix + + PQCLEAN_MCELIECE348864_CLEAN_root(inv, g, L); + + for (i = 0; i < SYS_N; i++) { + inv[i] = PQCLEAN_MCELIECE348864_CLEAN_gf_inv(inv[i]); + } + + for (i = 0; i < PK_NROWS; i++) { + for (j = 0; j < SYS_N / 8; j++) { + mat[i][j] = 0; + } + } + + for (i = 0; i < SYS_T; i++) { + for (j = 0; j < SYS_N; j += 8) { + for (k = 0; k < GFBITS; k++) { + b = (inv[j + 7] >> k) & 1; + b <<= 1; + b |= (inv[j + 6] >> k) & 1; + b <<= 1; + b |= (inv[j + 5] >> k) & 1; + b <<= 1; + b |= (inv[j + 4] >> k) & 1; + b <<= 1; + b |= (inv[j + 3] >> k) & 1; + b <<= 1; + b |= (inv[j + 2] >> k) & 1; + b <<= 1; + b |= (inv[j + 1] >> k) & 1; + b <<= 1; + b |= (inv[j + 0] >> k) & 1; + + mat[ i * GFBITS + k ][ j / 8 ] = b; + } + } + + for (j = 0; j < SYS_N; j++) { + inv[j] = PQCLEAN_MCELIECE348864_CLEAN_gf_mul(inv[j], L[j]); + } + + } + + // gaussian elimination + + for (i = 0; i < (GFBITS * SYS_T + 7) / 8; i++) { + for (j = 0; j < 8; j++) { + row = i * 8 + j; + + if (row >= GFBITS * SYS_T) { + break; + } + + for (k = row + 1; k < GFBITS * SYS_T; k++) { + mask = mat[ row ][ i ] ^ mat[ k ][ i ]; + mask >>= j; + mask &= 1; + mask = -mask; + + for (c = 0; c < SYS_N / 8; c++) { + mat[ row ][ c ] ^= mat[ k ][ c ] & mask; + } + } + + if ( ((mat[ row ][ i ] >> j) & 1) == 0 ) { // return if not systematic + return -1; + } + + for (k = 0; k < GFBITS * SYS_T; k++) { + if (k != row) { + mask = mat[ k ][ i ] >> j; + mask &= 1; + mask = -mask; + + for (c = 0; c < SYS_N / 8; c++) { + mat[ k ][ c ] ^= mat[ row ][ c ] & mask; + } + } + } + } + } + + for (i = 0; i < PK_NROWS; i++) { + memcpy(pk + i * PK_ROW_BYTES, mat[i] 
+ PK_NROWS / 8, PK_ROW_BYTES); + } + + return 0; +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_clean/pk_gen.h b/src/kem/classic_mceliece/pqclean_mceliece348864_clean/pk_gen.h new file mode 100644 index 0000000000..e92992f550 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_clean/pk_gen.h @@ -0,0 +1,13 @@ +#ifndef PQCLEAN_MCELIECE348864_CLEAN_PK_GEN_H +#define PQCLEAN_MCELIECE348864_CLEAN_PK_GEN_H +/* + This file is for public-key generation +*/ + + +#include + +int PQCLEAN_MCELIECE348864_CLEAN_pk_gen(uint8_t * /*pk*/, uint32_t * /*perm*/, const uint8_t * /*sk*/); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_clean/root.c b/src/kem/classic_mceliece/pqclean_mceliece348864_clean/root.c new file mode 100644 index 0000000000..a57f215c34 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_clean/root.c @@ -0,0 +1,33 @@ +/* + This file is for evaluating a polynomial at one or more field elements +*/ +#include "root.h" + +#include "params.h" + +/* input: polynomial f and field element a */ +/* return f(a) */ +gf PQCLEAN_MCELIECE348864_CLEAN_eval(gf *f, gf a) { + int i; + gf r; + + r = f[ SYS_T ]; + + for (i = SYS_T - 1; i >= 0; i--) { + r = PQCLEAN_MCELIECE348864_CLEAN_gf_mul(r, a); + r = PQCLEAN_MCELIECE348864_CLEAN_gf_add(r, f[i]); + } + + return r; +} + +/* input: polynomial f and list of field elements L */ +/* output: out = [ f(a) for a in L ] */ +void PQCLEAN_MCELIECE348864_CLEAN_root(gf *out, gf *f, gf *L) { + int i; + + for (i = 0; i < SYS_N; i++) { + out[i] = PQCLEAN_MCELIECE348864_CLEAN_eval(f, L[i]); + } +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_clean/root.h b/src/kem/classic_mceliece/pqclean_mceliece348864_clean/root.h new file mode 100644 index 0000000000..6b125234e5 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_clean/root.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE348864_CLEAN_ROOT_H +#define PQCLEAN_MCELIECE348864_CLEAN_ROOT_H +/* + This file is for evaluating a polynomial at one or more field elements +*/ + + +#include "gf.h" + +gf PQCLEAN_MCELIECE348864_CLEAN_eval(gf * /*f*/, gf /*a*/); +void PQCLEAN_MCELIECE348864_CLEAN_root(gf * /*out*/, gf * /*f*/, gf * /*L*/); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_clean/sk_gen.c b/src/kem/classic_mceliece/pqclean_mceliece348864_clean/sk_gen.c new file mode 100644 index 0000000000..d75075e381 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_clean/sk_gen.c @@ -0,0 +1,98 @@ +/* + This file is for secret-key generation +*/ + +#include "sk_gen.h" + +#include "controlbits.h" +#include "gf.h" +#include "params.h" +#include "util.h" + +/* input: f, element in GF((2^m)^t) */ +/* output: out, minimal polynomial of f */ +/* return: 0 for success and -1 for failure */ +int PQCLEAN_MCELIECE348864_CLEAN_genpoly_gen(gf *out, gf *f) { + int i, j, k, c; + + gf mat[ SYS_T + 1 ][ SYS_T ]; + gf mask, inv, t; + + // fill matrix + + mat[0][0] = 1; + + for (i = 1; i < SYS_T; i++) { + mat[0][i] = 0; + } + + for (i = 0; i < SYS_T; i++) { + mat[1][i] = f[i]; + } + + for (j = 2; j <= SYS_T; j++) { + PQCLEAN_MCELIECE348864_CLEAN_GF_mul(mat[j], mat[j - 1], f); + } + + // gaussian + + for (j = 0; j < SYS_T; j++) { + for (k = j + 1; k < SYS_T; k++) { + mask = PQCLEAN_MCELIECE348864_CLEAN_gf_iszero(mat[ j ][ j ]); + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ j ] ^= mat[ c ][ k ] & mask; + } + + } + + if ( mat[ j ][ j ] == 0 ) { // return if not systematic + return -1; + } + + inv = 
PQCLEAN_MCELIECE348864_CLEAN_gf_inv(mat[j][j]); + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ j ] = PQCLEAN_MCELIECE348864_CLEAN_gf_mul(mat[ c ][ j ], inv) ; + } + + for (k = 0; k < SYS_T; k++) { + if (k != j) { + t = mat[ j ][ k ]; + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ k ] ^= PQCLEAN_MCELIECE348864_CLEAN_gf_mul(mat[ c ][ j ], t); + } + } + } + } + + for (i = 0; i < SYS_T; i++) { + out[i] = mat[ SYS_T ][ i ]; + } + + return 0; +} + +/* input: permutation p represented as a list of 32-bit intergers */ +/* output: -1 if some interger repeats in p */ +/* 0 otherwise */ +int PQCLEAN_MCELIECE348864_CLEAN_perm_check(const uint32_t *p) { + int i; + uint64_t list[1 << GFBITS]; + + for (i = 0; i < (1 << GFBITS); i++) { + list[i] = p[i]; + } + + PQCLEAN_MCELIECE348864_CLEAN_sort_63b(1 << GFBITS, list); + + for (i = 1; i < (1 << GFBITS); i++) { + if (list[i - 1] == list[i]) { + return -1; + } + } + + return 0; +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_clean/sk_gen.h b/src/kem/classic_mceliece/pqclean_mceliece348864_clean/sk_gen.h new file mode 100644 index 0000000000..6f1df9afe0 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_clean/sk_gen.h @@ -0,0 +1,16 @@ +#ifndef PQCLEAN_MCELIECE348864_CLEAN_SK_GEN_H +#define PQCLEAN_MCELIECE348864_CLEAN_SK_GEN_H +/* + This file is for secret-key generation +*/ + + +#include "gf.h" + +#include + +int PQCLEAN_MCELIECE348864_CLEAN_genpoly_gen(gf * /*out*/, gf * /*f*/); +int PQCLEAN_MCELIECE348864_CLEAN_perm_check(const uint32_t * /*p*/); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_clean/synd.c b/src/kem/classic_mceliece/pqclean_mceliece348864_clean/synd.c new file mode 100644 index 0000000000..d473bb1e16 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_clean/synd.c @@ -0,0 +1,33 @@ +/* + This file is for syndrome computation +*/ + +#include "synd.h" + +#include "params.h" +#include "root.h" + + +/* input: Goppa polynomial f, support L, received word r */ +/* output: out, the syndrome of length 2t */ +void PQCLEAN_MCELIECE348864_CLEAN_synd(gf *out, gf *f, gf *L, const unsigned char *r) { + int i, j; + gf e, e_inv, c; + + for (j = 0; j < 2 * SYS_T; j++) { + out[j] = 0; + } + + for (i = 0; i < SYS_N; i++) { + c = (r[i / 8] >> (i % 8)) & 1; + + e = PQCLEAN_MCELIECE348864_CLEAN_eval(f, L[i]); + e_inv = PQCLEAN_MCELIECE348864_CLEAN_gf_inv(PQCLEAN_MCELIECE348864_CLEAN_gf_mul(e, e)); + + for (j = 0; j < 2 * SYS_T; j++) { + out[j] = PQCLEAN_MCELIECE348864_CLEAN_gf_add(out[j], PQCLEAN_MCELIECE348864_CLEAN_gf_mul(e_inv, c)); + e_inv = PQCLEAN_MCELIECE348864_CLEAN_gf_mul(e_inv, L[i]); + } + } +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_clean/synd.h b/src/kem/classic_mceliece/pqclean_mceliece348864_clean/synd.h new file mode 100644 index 0000000000..34b61bcd4a --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_clean/synd.h @@ -0,0 +1,12 @@ +#ifndef PQCLEAN_MCELIECE348864_CLEAN_SYND_H +#define PQCLEAN_MCELIECE348864_CLEAN_SYND_H +/* + This file is for syndrome computation +*/ + +#include "gf.h" + +void PQCLEAN_MCELIECE348864_CLEAN_synd(gf * /*out*/, gf * /*f*/, gf * /*L*/, const unsigned char * /*r*/); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_clean/transpose.c b/src/kem/classic_mceliece/pqclean_mceliece348864_clean/transpose.c new file mode 100644 index 0000000000..cbad4f7b92 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_clean/transpose.c @@ -0,0 +1,42 @@ +/* + This file is for 
matrix transposition +*/ + +#include "transpose.h" + +#include + +/* input: in, a 64x64 matrix over GF(2) */ +/* output: out, transpose of in */ +void PQCLEAN_MCELIECE348864_CLEAN_transpose_64x64(uint64_t *out, const uint64_t *in) { + int i, j, s, d; + + uint64_t x, y; + uint64_t masks[6][2] = { + {0x5555555555555555, 0xAAAAAAAAAAAAAAAA}, + {0x3333333333333333, 0xCCCCCCCCCCCCCCCC}, + {0x0F0F0F0F0F0F0F0F, 0xF0F0F0F0F0F0F0F0}, + {0x00FF00FF00FF00FF, 0xFF00FF00FF00FF00}, + {0x0000FFFF0000FFFF, 0xFFFF0000FFFF0000}, + {0x00000000FFFFFFFF, 0xFFFFFFFF00000000} + }; + + for (i = 0; i < 64; i++) { + out[i] = in[i]; + } + + for (d = 5; d >= 0; d--) { + s = 1 << d; + + for (i = 0; i < 64; i += s * 2) { + for (j = i; j < i + s; j++) { + x = (out[j] & masks[d][0]) | ((out[j + s] & masks[d][0]) << s); + y = ((out[j] & masks[d][1]) >> s) | (out[j + s] & masks[d][1]); + + out[j + 0] = x; + out[j + s] = y; + } + } + } +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_clean/transpose.h b/src/kem/classic_mceliece/pqclean_mceliece348864_clean/transpose.h new file mode 100644 index 0000000000..1bdc673ddd --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_clean/transpose.h @@ -0,0 +1,13 @@ +#ifndef PQCLEAN_MCELIECE348864_CLEAN_TRANSPOSE_H +#define PQCLEAN_MCELIECE348864_CLEAN_TRANSPOSE_H +/* + This file is for matrix transposition +*/ + + +#include + +void PQCLEAN_MCELIECE348864_CLEAN_transpose_64x64(uint64_t * /*out*/, const uint64_t * /*in*/); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_clean/util.c b/src/kem/classic_mceliece/pqclean_mceliece348864_clean/util.c new file mode 100644 index 0000000000..75f1bc9ca0 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_clean/util.c @@ -0,0 +1,67 @@ +/* + This file is for loading/storing data in a little-endian fashion +*/ + +#include "util.h" + +#include "params.h" + +void PQCLEAN_MCELIECE348864_CLEAN_store2(unsigned char *dest, gf a) { + dest[0] = a & 0xFF; + dest[1] = a >> 8; +} + +uint16_t PQCLEAN_MCELIECE348864_CLEAN_load2(const unsigned char *src) { + uint16_t a; + + a = src[1]; + a <<= 8; + a |= src[0]; + + return a & GFMASK; +} + +uint32_t PQCLEAN_MCELIECE348864_CLEAN_load4(const unsigned char *in) { + int i; + uint32_t ret = in[3]; + + for (i = 2; i >= 0; i--) { + ret <<= 8; + ret |= in[i]; + } + + return ret; +} + +void PQCLEAN_MCELIECE348864_CLEAN_store8(unsigned char *out, uint64_t in) { + out[0] = (in >> 0x00) & 0xFF; + out[1] = (in >> 0x08) & 0xFF; + out[2] = (in >> 0x10) & 0xFF; + out[3] = (in >> 0x18) & 0xFF; + out[4] = (in >> 0x20) & 0xFF; + out[5] = (in >> 0x28) & 0xFF; + out[6] = (in >> 0x30) & 0xFF; + out[7] = (in >> 0x38) & 0xFF; +} + +uint64_t PQCLEAN_MCELIECE348864_CLEAN_load8(const unsigned char *in) { + int i; + uint64_t ret = in[7]; + + for (i = 6; i >= 0; i--) { + ret <<= 8; + ret |= in[i]; + } + + return ret; +} + +gf PQCLEAN_MCELIECE348864_CLEAN_bitrev(gf a) { + a = ((a & 0x00FF) << 8) | ((a & 0xFF00) >> 8); + a = ((a & 0x0F0F) << 4) | ((a & 0xF0F0) >> 4); + a = ((a & 0x3333) << 2) | ((a & 0xCCCC) >> 2); + a = ((a & 0x5555) << 1) | ((a & 0xAAAA) >> 1); + + return a >> 4; +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_clean/util.h b/src/kem/classic_mceliece/pqclean_mceliece348864_clean/util.h new file mode 100644 index 0000000000..25b6f96637 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_clean/util.h @@ -0,0 +1,22 @@ +#ifndef PQCLEAN_MCELIECE348864_CLEAN_UTIL_H +#define PQCLEAN_MCELIECE348864_CLEAN_UTIL_H +/* + This file is 
for loading/storing data in a little-endian fashion +*/ + + +#include "gf.h" +#include + +void PQCLEAN_MCELIECE348864_CLEAN_store2(unsigned char * /*dest*/, gf /*a*/); +uint16_t PQCLEAN_MCELIECE348864_CLEAN_load2(const unsigned char * /*src*/); + +uint32_t PQCLEAN_MCELIECE348864_CLEAN_load4(const unsigned char * /*in*/); + +void PQCLEAN_MCELIECE348864_CLEAN_store8(unsigned char * /*out*/, uint64_t /*in*/); +uint64_t PQCLEAN_MCELIECE348864_CLEAN_load8(const unsigned char * /*in*/); + +gf PQCLEAN_MCELIECE348864_CLEAN_bitrev(gf /*a*/); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_sse/LICENSE b/src/kem/classic_mceliece/pqclean_mceliece348864_sse/LICENSE new file mode 100644 index 0000000000..eba3e7ced4 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_sse/LICENSE @@ -0,0 +1,16 @@ +Public Domain. + +Authors of Classic McEliece in alphabetical order: + +Daniel J. Bernstein, University of Illinois at Chicago +Tung Chou, Osaka University +Tanja Lange, Technische Universiteit Eindhoven +Ingo von Maurich, self +Rafael Misoczki, Intel Corporation +Ruben Niederhagen, Fraunhofer SIT +Edoardo Persichetti, Florida Atlantic University +Christiane Peters, self +Peter Schwabe, Radboud University +Nicolas Sendrier, Inria +Jakub Szefer, Yale University +Wen Wang, Yale University diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_sse/aes256ctr.c b/src/kem/classic_mceliece/pqclean_mceliece348864_sse/aes256ctr.c new file mode 100644 index 0000000000..788493486d --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_sse/aes256ctr.c @@ -0,0 +1,13 @@ +#include "aes256ctr.h" + +void PQCLEAN_MCELIECE348864_SSE_aes256ctr( + uint8_t *out, + size_t outlen, + const uint8_t nonce[AESCTR_NONCEBYTES], + const uint8_t key[AES256_KEYBYTES]) { + + aes256ctx state; + aes256_ctr_keyexp(&state, key); + aes256_ctr(out, outlen, nonce, &state); + aes256_ctx_release(&state); +} diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_sse/aes256ctr.h b/src/kem/classic_mceliece/pqclean_mceliece348864_sse/aes256ctr.h new file mode 100644 index 0000000000..9f62b86d77 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_sse/aes256ctr.h @@ -0,0 +1,17 @@ +#ifndef PQCLEAN_MCELIECE348864_SSE_AES256CTR_H +#define PQCLEAN_MCELIECE348864_SSE_AES256CTR_H + +#include +#include + +#include "aes.h" + + +void PQCLEAN_MCELIECE348864_SSE_aes256ctr( + uint8_t *out, + size_t outlen, + const uint8_t nonce[AESCTR_NONCEBYTES], + const uint8_t key[AES256_KEYBYTES] +); + +#endif diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_sse/api.h b/src/kem/classic_mceliece/pqclean_mceliece348864_sse/api.h new file mode 100644 index 0000000000..d834750e2c --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_sse/api.h @@ -0,0 +1,32 @@ +#ifndef PQCLEAN_MCELIECE348864_SSE_API_H +#define PQCLEAN_MCELIECE348864_SSE_API_H + +#include + +#define PQCLEAN_MCELIECE348864_SSE_CRYPTO_ALGNAME "Classic McEliece 348864" +#define PQCLEAN_MCELIECE348864_SSE_CRYPTO_PUBLICKEYBYTES 261120 +#define PQCLEAN_MCELIECE348864_SSE_CRYPTO_SECRETKEYBYTES 6452 +#define PQCLEAN_MCELIECE348864_SSE_CRYPTO_CIPHERTEXTBYTES 128 +#define PQCLEAN_MCELIECE348864_SSE_CRYPTO_BYTES 32 + + +int PQCLEAN_MCELIECE348864_SSE_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const uint8_t *pk +); + +int PQCLEAN_MCELIECE348864_SSE_crypto_kem_dec( + uint8_t *key, + const uint8_t *c, + const uint8_t *sk +); + +int PQCLEAN_MCELIECE348864_SSE_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +); + +#endif + 
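The _clean reference sources end with the util.c/util.h helpers above; the same little-endian load/store and 12-bit bit-reversal conventions reappear in the vectorized (SSE/AVX) variants added below. As an editorial illustration only, and not part of the patch, the following standalone C sketch re-states those two conventions locally and checks them with asserts; the names le_store8, le_load8, and bitrev12 are invented for this sketch and do not exist in the sources.

#include <assert.h>
#include <stdint.h>

/* Little-endian 64-bit store, same byte order as the store8 helper above. */
static void le_store8(unsigned char *out, uint64_t in) {
    for (int i = 0; i < 8; i++) {
        out[i] = (unsigned char)(in >> (8 * i));
    }
}

/* Little-endian 64-bit load, same byte order as the load8 helper above. */
static uint64_t le_load8(const unsigned char *in) {
    uint64_t ret = 0;
    for (int i = 7; i >= 0; i--) {
        ret = (ret << 8) | in[i];
    }
    return ret;
}

/* Reverse the low GFBITS = 12 bits of a field element, as bitrev does:
   reverse all 16 bits, then drop the top 4 by shifting right. */
static uint16_t bitrev12(uint16_t a) {
    a = (uint16_t)(((a & 0x00FF) << 8) | ((a & 0xFF00) >> 8));
    a = (uint16_t)(((a & 0x0F0F) << 4) | ((a & 0xF0F0) >> 4));
    a = (uint16_t)(((a & 0x3333) << 2) | ((a & 0xCCCC) >> 2));
    a = (uint16_t)(((a & 0x5555) << 1) | ((a & 0xAAAA) >> 1));
    return (uint16_t)(a >> 4);
}

int main(void) {
    unsigned char buf[8];
    le_store8(buf, 0x0123456789ABCDEFULL);
    assert(buf[0] == 0xEF && buf[7] == 0x01);   /* least significant byte first */
    assert(le_load8(buf) == 0x0123456789ABCDEFULL);
    assert(bitrev12(0x001) == 0x800);           /* bit 0 maps to bit 11 */
    assert(bitrev12(0x800) == 0x001);
    return 0;
}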
diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_sse/benes.c b/src/kem/classic_mceliece/pqclean_mceliece348864_sse/benes.c new file mode 100644 index 0000000000..d0bb0f1ab4 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_sse/benes.c @@ -0,0 +1,287 @@ +/* + This file is for Benes network related functions +*/ +#include "benes.h" + +#include "params.h" +#include "transpose.h" +#include "util.h" + +static void layer_0(uint64_t *bs, const uint64_t *cond) { + int x; + uint64_t diff; + + for (x = 0; x < (1 << 6); x += 2) { + diff = bs[ x ] ^ bs[ x + 1 ]; + diff &= *cond++; + bs[ x ] ^= diff; + bs[ x + 1 ] ^= diff; + } +} + +static void layer_1(uint64_t *bs, const uint64_t *cond) { + int x; + uint64_t diff; + + for (x = 0; x < (1 << 6); x += 4) { + diff = bs[ x + 0 ] ^ bs[ x + 2 ]; + diff &= cond[0]; + bs[ x + 0 ] ^= diff; + bs[ x + 2 ] ^= diff; + + diff = bs[ x + 1 ] ^ bs[ x + 3 ]; + diff &= cond[1]; + bs[ x + 1 ] ^= diff; + bs[ x + 3 ] ^= diff; + + cond += 2; + } +} + +static void layer_2(uint64_t *bs, const uint64_t *cond) { + int x; + uint64_t diff; + + for (x = 0; x < (1 << 6); x += 8) { + diff = bs[ x + 0 ] ^ bs[ x + 4 ]; + diff &= cond[0]; + bs[ x + 0 ] ^= diff; + bs[ x + 4 ] ^= diff; + + diff = bs[ x + 1 ] ^ bs[ x + 5 ]; + diff &= cond[1]; + bs[ x + 1 ] ^= diff; + bs[ x + 5 ] ^= diff; + + diff = bs[ x + 2 ] ^ bs[ x + 6 ]; + diff &= cond[2]; + bs[ x + 2 ] ^= diff; + bs[ x + 6 ] ^= diff; + + diff = bs[ x + 3 ] ^ bs[ x + 7 ]; + diff &= cond[3]; + bs[ x + 3 ] ^= diff; + bs[ x + 7 ] ^= diff; + + cond += 4; + } +} + +static void layer_3(uint64_t *bs, const uint64_t *cond) { + int x, s; + uint64_t diff; + + for (x = 0; x < (1 << 6); x += 16) { + for (s = x; s < x + 8; s += 4) { + diff = bs[ s + 0 ] ^ bs[ s + 8 ]; + diff &= cond[0]; + bs[ s + 0 ] ^= diff; + bs[ s + 8 ] ^= diff; + + diff = bs[ s + 1 ] ^ bs[ s + 9 ]; + diff &= cond[1]; + bs[ s + 1 ] ^= diff; + bs[ s + 9 ] ^= diff; + + diff = bs[ s + 2 ] ^ bs[ s + 10 ]; + diff &= cond[2]; + bs[ s + 2 ] ^= diff; + bs[ s + 10 ] ^= diff; + + diff = bs[ s + 3 ] ^ bs[ s + 11 ]; + diff &= cond[3]; + bs[ s + 3 ] ^= diff; + bs[ s + 11 ] ^= diff; + + cond += 4; + } + } +} + +static void layer_4(uint64_t *bs, const uint64_t *cond) { + int x, s; + uint64_t diff; + + for (x = 0; x < (1 << 6); x += 32) { + for (s = x; s < x + 16; s += 4) { + diff = bs[ s + 0 ] ^ bs[ s + 16 ]; + diff &= cond[0]; + bs[ s + 0 ] ^= diff; + bs[ s + 16 ] ^= diff; + + diff = bs[ s + 1 ] ^ bs[ s + 17 ]; + diff &= cond[1]; + bs[ s + 1 ] ^= diff; + bs[ s + 17 ] ^= diff; + + diff = bs[ s + 2 ] ^ bs[ s + 18 ]; + diff &= cond[2]; + bs[ s + 2 ] ^= diff; + bs[ s + 18 ] ^= diff; + + diff = bs[ s + 3 ] ^ bs[ s + 19 ]; + diff &= cond[3]; + bs[ s + 3 ] ^= diff; + bs[ s + 19 ] ^= diff; + + cond += 4; + } + } +} + +static void layer_5(uint64_t *bs, const uint64_t *cond) { + int x, s; + uint64_t diff; + + for (x = 0; x < (1 << 6); x += 64) { + for (s = x; s < x + 32; s += 4) { + diff = bs[ s + 0 ] ^ bs[ s + 32 ]; + diff &= cond[0]; + bs[ s + 0 ] ^= diff; + bs[ s + 32 ] ^= diff; + + diff = bs[ s + 1 ] ^ bs[ s + 33 ]; + diff &= cond[1]; + bs[ s + 1 ] ^= diff; + bs[ s + 33 ] ^= diff; + + diff = bs[ s + 2 ] ^ bs[ s + 34 ]; + diff &= cond[2]; + bs[ s + 2 ] ^= diff; + bs[ s + 34 ] ^= diff; + + diff = bs[ s + 3 ] ^ bs[ s + 35 ]; + diff &= cond[3]; + bs[ s + 3 ] ^= diff; + bs[ s + 35 ] ^= diff; + + cond += 4; + } + } +} + +/* input: bits, control bits as array of bytes */ +/* output: out, control bits as array of 128-bit vectors */ +void 
PQCLEAN_MCELIECE348864_SSE_load_bits(uint64_t out[][32], const unsigned char *bits) { + int i, low, block = 0; + + uint64_t cond[64]; + + // + + for (low = 0; low <= 5; low++) { + for (i = 0; i < 64; i++) { + cond[i] = PQCLEAN_MCELIECE348864_SSE_load4(bits + block * 256 + i * 4); + } + PQCLEAN_MCELIECE348864_SSE_transpose_64x64(cond); + + for (i = 0; i < 32; i++) { + out[ block ][i] = cond[i]; + } + block++; + } + + for (low = 0; low <= 5; low++) { + for (i = 0; i < 32; i++) { + out[ block ][i] = PQCLEAN_MCELIECE348864_SSE_load8(bits + block * 256 + i * 8); + } + block++; + } + + for (low = 4; low >= 0; low--) { + for (i = 0; i < 32; i++) { + out[ block ][i] = PQCLEAN_MCELIECE348864_SSE_load8(bits + block * 256 + i * 8); + } + block++; + } + + for (low = 5; low >= 0; low--) { + for (i = 0; i < 64; i++) { + cond[i] = PQCLEAN_MCELIECE348864_SSE_load4(bits + block * 256 + i * 4); + } + PQCLEAN_MCELIECE348864_SSE_transpose_64x64(cond); + + for (i = 0; i < 32; i++) { + out[ block ][i] = cond[i]; + } + block++; + } +} + +/* input: r, sequence of bits to be permuted */ +/* cond, control bits as array of 128-bit vectors */ +/* rev, 0 for normal application; !0 for inverse */ +/* output: r, permuted bits */ +void PQCLEAN_MCELIECE348864_SSE_benes(uint64_t *r, uint64_t cond[][32], int rev) { + int block, inc; + + uint64_t *bs = r; + + // + + if (rev == 0) { + block = 0; + inc = 1; + } else { + block = 22; + inc = -1; + } + + PQCLEAN_MCELIECE348864_SSE_transpose_64x64(bs); + + layer_0(bs, cond[ block ]); + block += inc; + layer_1(bs, cond[ block ]); + block += inc; + layer_2(bs, cond[ block ]); + block += inc; + layer_3(bs, cond[ block ]); + block += inc; + layer_4(bs, cond[ block ]); + block += inc; + layer_5(bs, cond[ block ]); + block += inc; + + PQCLEAN_MCELIECE348864_SSE_transpose_64x64(bs); + + layer_0(bs, cond[ block ]); + block += inc; + layer_1(bs, cond[ block ]); + block += inc; + layer_2(bs, cond[ block ]); + block += inc; + layer_3(bs, cond[ block ]); + block += inc; + layer_4(bs, cond[ block ]); + block += inc; + layer_5(bs, cond[ block ]); + block += inc; + layer_4(bs, cond[ block ]); + block += inc; + layer_3(bs, cond[ block ]); + block += inc; + layer_2(bs, cond[ block ]); + block += inc; + layer_1(bs, cond[ block ]); + block += inc; + layer_0(bs, cond[ block ]); + block += inc; + + PQCLEAN_MCELIECE348864_SSE_transpose_64x64(bs); + + layer_5(bs, cond[ block ]); + block += inc; + layer_4(bs, cond[ block ]); + block += inc; + layer_3(bs, cond[ block ]); + block += inc; + layer_2(bs, cond[ block ]); + block += inc; + layer_1(bs, cond[ block ]); + block += inc; + layer_0(bs, cond[ block ]); + //block += inc; + + PQCLEAN_MCELIECE348864_SSE_transpose_64x64(bs); +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_sse/benes.h b/src/kem/classic_mceliece/pqclean_mceliece348864_sse/benes.h new file mode 100644 index 0000000000..267744befa --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_sse/benes.h @@ -0,0 +1,15 @@ +#ifndef PQCLEAN_MCELIECE348864_SSE_BENES_H +#define PQCLEAN_MCELIECE348864_SSE_BENES_H +/* + This file is for Benes network related functions +*/ + + +#include "gf.h" +#include "vec128.h" + +void PQCLEAN_MCELIECE348864_SSE_load_bits(uint64_t /*out*/[][32], const unsigned char * /*bits*/); +void PQCLEAN_MCELIECE348864_SSE_benes(uint64_t * /*r*/, uint64_t /*cond*/[][32], int /*rev*/); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_sse/bm.c b/src/kem/classic_mceliece/pqclean_mceliece348864_sse/bm.c new file mode 100644 index 
0000000000..e3257e44c1 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_sse/bm.c @@ -0,0 +1,220 @@ +/* + This file is for the inversion-free Berlekamp-Massey algorithm + see https://ieeexplore.ieee.org/document/87857 +*/ + +#include "bm.h" + +#include "gf.h" +#include "util.h" +#include "vec.h" +#include "vec128.h" + +#include +#include + +extern void PQCLEAN_MCELIECE348864_SSE_update_asm(void *, gf, int); +extern gf PQCLEAN_MCELIECE348864_SSE_vec_reduce_asm(uint64_t *); + +static inline uint64_t mask_nonzero(gf a) { + uint64_t ret = a; + + ret -= 1; + ret >>= 63; + ret -= 1; + + return ret; +} + +static inline uint64_t mask_leq(uint16_t a, uint16_t b) { + uint64_t a_tmp = a; + uint64_t b_tmp = b; + uint64_t ret = b_tmp - a_tmp; + + ret >>= 63; + ret -= 1; + + return ret; +} + +static void vec_cmov(uint64_t out[][2], uint64_t mask) { + int i; + + for (i = 0; i < GFBITS; i++) { + out[i][0] = (out[i][0] & ~mask) | (out[i][1] & mask); + } +} + +static inline void interleave(vec128 *in, int idx0, int idx1, vec128 *mask, int b) { + int s = 1 << b; + + vec128 x, y; + + x = PQCLEAN_MCELIECE348864_SSE_vec128_or(PQCLEAN_MCELIECE348864_SSE_vec128_and(in[idx0], mask[0]), + PQCLEAN_MCELIECE348864_SSE_vec128_sll_2x(PQCLEAN_MCELIECE348864_SSE_vec128_and(in[idx1], mask[0]), s)); + + y = PQCLEAN_MCELIECE348864_SSE_vec128_or(PQCLEAN_MCELIECE348864_SSE_vec128_srl_2x(PQCLEAN_MCELIECE348864_SSE_vec128_and(in[idx0], mask[1]), s), + PQCLEAN_MCELIECE348864_SSE_vec128_and(in[idx1], mask[1])); + + in[idx0] = x; + in[idx1] = y; +} + +/* input: in, field elements in bitsliced form */ +/* output: out, field elements in non-bitsliced form */ +static inline void get_coefs(gf *out, vec128 *in) { + int i, k; + + vec128 mask[4][2]; + vec128 buf[16]; + + for (i = 0; i < GFBITS; i++) { + buf[i] = in[i]; + } + for (i = GFBITS; i < 16; i++) { + buf[i] = PQCLEAN_MCELIECE348864_SSE_vec128_setzero(); + } + + mask[0][0] = PQCLEAN_MCELIECE348864_SSE_vec128_set1_16b(0x5555); + mask[0][1] = PQCLEAN_MCELIECE348864_SSE_vec128_set1_16b(0xAAAA); + mask[1][0] = PQCLEAN_MCELIECE348864_SSE_vec128_set1_16b(0x3333); + mask[1][1] = PQCLEAN_MCELIECE348864_SSE_vec128_set1_16b(0xCCCC); + mask[2][0] = PQCLEAN_MCELIECE348864_SSE_vec128_set1_16b(0x0F0F); + mask[2][1] = PQCLEAN_MCELIECE348864_SSE_vec128_set1_16b(0xF0F0); + mask[3][0] = PQCLEAN_MCELIECE348864_SSE_vec128_set1_16b(0x00FF); + mask[3][1] = PQCLEAN_MCELIECE348864_SSE_vec128_set1_16b(0xFF00); + + interleave(buf, 0, 8, mask[3], 3); + interleave(buf, 1, 9, mask[3], 3); + interleave(buf, 2, 10, mask[3], 3); + interleave(buf, 3, 11, mask[3], 3); + interleave(buf, 4, 12, mask[3], 3); + interleave(buf, 5, 13, mask[3], 3); + interleave(buf, 6, 14, mask[3], 3); + interleave(buf, 7, 15, mask[3], 3); + + interleave(buf, 0, 4, mask[2], 2); + interleave(buf, 1, 5, mask[2], 2); + interleave(buf, 2, 6, mask[2], 2); + interleave(buf, 3, 7, mask[2], 2); + interleave(buf, 8, 12, mask[2], 2); + interleave(buf, 9, 13, mask[2], 2); + interleave(buf, 10, 14, mask[2], 2); + interleave(buf, 11, 15, mask[2], 2); + + interleave(buf, 0, 2, mask[1], 1); + interleave(buf, 1, 3, mask[1], 1); + interleave(buf, 4, 6, mask[1], 1); + interleave(buf, 5, 7, mask[1], 1); + interleave(buf, 8, 10, mask[1], 1); + interleave(buf, 9, 11, mask[1], 1); + interleave(buf, 12, 14, mask[1], 1); + interleave(buf, 13, 15, mask[1], 1); + + interleave(buf, 0, 1, mask[0], 0); + interleave(buf, 2, 3, mask[0], 0); + interleave(buf, 4, 5, mask[0], 0); + interleave(buf, 6, 7, mask[0], 0); + interleave(buf, 8, 9, mask[0], 0); 
+ interleave(buf, 10, 11, mask[0], 0); + interleave(buf, 12, 13, mask[0], 0); + interleave(buf, 14, 15, mask[0], 0); + + for (i = 0; i < 16; i++) { + for (k = 0; k < 4; k++) { + out[ (4 * 0 + k) * 16 + i ] = (PQCLEAN_MCELIECE348864_SSE_vec128_extract(buf[i], 0) >> (k * 16)) & GFMASK; + out[ (4 * 1 + k) * 16 + i ] = (PQCLEAN_MCELIECE348864_SSE_vec128_extract(buf[i], 1) >> (k * 16)) & GFMASK; + } + } +} + +/* input: in, sequence of field elements */ +/* output: out, minimal polynomial of in */ +void PQCLEAN_MCELIECE348864_SSE_bm(uint64_t out[ GFBITS ], vec128 in[ GFBITS ]) { + uint16_t i; + uint16_t N, L; + + uint64_t prod[ GFBITS ]; + uint64_t in_tmp[ GFBITS ]; + + uint64_t db[ GFBITS ][ 2 ]; + uint64_t BC_tmp[ GFBITS ][ 2 ]; + uint64_t BC[ GFBITS ][ 2 ]; + + uint64_t mask, t; + + gf d, b, c0 = 1; + + gf coefs[SYS_T * 2]; + + // init + + BC[0][1] = 0; + BC[0][0] = 1; + BC[0][0] <<= 63; + + for (i = 1; i < GFBITS; i++) { + BC[i][0] = BC[i][1] = 0; + } + + b = 1; + L = 0; + + // + + get_coefs(coefs, in); + + for (i = 0; i < GFBITS; i++) { + in_tmp[i] = 0; + } + + for (N = 0; N < SYS_T * 2; N++) { + // computing d + + PQCLEAN_MCELIECE348864_SSE_vec_mul_asm(prod, in_tmp, &BC[0][1], 16); + + PQCLEAN_MCELIECE348864_SSE_update_asm(in_tmp, coefs[N], 8); + + d = PQCLEAN_MCELIECE348864_SSE_vec_reduce_asm(prod); + + t = PQCLEAN_MCELIECE348864_SSE_gf_mul2(c0, coefs[N], b); + + d ^= t & 0xFFFFFFFF; + + // 3 cases + + mask = mask_nonzero(d) & mask_leq(L * 2, N); + + for (i = 0; i < GFBITS; i++) { + db[i][0] = (d >> i) & 1; + db[i][0] = -db[i][0]; + db[i][1] = (b >> i) & 1; + db[i][1] = -db[i][1]; + } + + PQCLEAN_MCELIECE348864_SSE_vec128_mul((vec128 *) BC_tmp, (vec128 *) db, (vec128 *) BC); + + vec_cmov(BC, mask); + + PQCLEAN_MCELIECE348864_SSE_update_asm(BC, mask & c0, 16); + + for (i = 0; i < GFBITS; i++) { + BC[i][1] = BC_tmp[i][0] ^ BC_tmp[i][1]; + } + + c0 = t >> 32; + b = (d & mask) | (b & ~mask); + L = ((N + 1 - L) & mask) | (L & ~mask); + + } + + c0 = PQCLEAN_MCELIECE348864_SSE_gf_inv(c0); + + for (i = 0; i < GFBITS; i++) { + out[i] = (c0 >> i) & 1; + out[i] = -out[i]; + } + + PQCLEAN_MCELIECE348864_SSE_vec_mul_asm(out, out, &BC[0][1], 16); +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_sse/bm.h b/src/kem/classic_mceliece/pqclean_mceliece348864_sse/bm.h new file mode 100644 index 0000000000..9430fe2d26 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_sse/bm.h @@ -0,0 +1,17 @@ +#ifndef PQCLEAN_MCELIECE348864_SSE_BM_H +#define PQCLEAN_MCELIECE348864_SSE_BM_H +/* + This file is for the inversion-free Berlekamp-Massey algorithm + see https://ieeexplore.ieee.org/document/87857 +*/ + + +#include + +#include "params.h" +#include "vec128.h" + +void PQCLEAN_MCELIECE348864_SSE_bm(uint64_t *out, vec128 *in); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_sse/consts.S b/src/kem/classic_mceliece/pqclean_mceliece348864_sse/consts.S new file mode 100644 index 0000000000..ff080b2f98 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_sse/consts.S @@ -0,0 +1,32 @@ +.data + +# not supported on macos +#.section .rodata +.globl PQCLEAN_MCELIECE348864_SSE_MASK0_0 +.globl PQCLEAN_MCELIECE348864_SSE_MASK0_1 +.globl PQCLEAN_MCELIECE348864_SSE_MASK1_0 +.globl PQCLEAN_MCELIECE348864_SSE_MASK1_1 +.globl PQCLEAN_MCELIECE348864_SSE_MASK2_0 +.globl PQCLEAN_MCELIECE348864_SSE_MASK2_1 +.globl PQCLEAN_MCELIECE348864_SSE_MASK3_0 +.globl PQCLEAN_MCELIECE348864_SSE_MASK3_1 +.globl PQCLEAN_MCELIECE348864_SSE_MASK4_0 +.globl 
PQCLEAN_MCELIECE348864_SSE_MASK4_1 +.globl PQCLEAN_MCELIECE348864_SSE_MASK5_0 +.globl PQCLEAN_MCELIECE348864_SSE_MASK5_1 + +.p2align 4 + +PQCLEAN_MCELIECE348864_SSE_MASK0_0: .quad 0x5555555555555555, 0x5555555555555555 +PQCLEAN_MCELIECE348864_SSE_MASK0_1: .quad 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA +PQCLEAN_MCELIECE348864_SSE_MASK1_0: .quad 0x3333333333333333, 0x3333333333333333 +PQCLEAN_MCELIECE348864_SSE_MASK1_1: .quad 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC +PQCLEAN_MCELIECE348864_SSE_MASK2_0: .quad 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F +PQCLEAN_MCELIECE348864_SSE_MASK2_1: .quad 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0 +PQCLEAN_MCELIECE348864_SSE_MASK3_0: .quad 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF +PQCLEAN_MCELIECE348864_SSE_MASK3_1: .quad 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00 +PQCLEAN_MCELIECE348864_SSE_MASK4_0: .quad 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF +PQCLEAN_MCELIECE348864_SSE_MASK4_1: .quad 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000 +PQCLEAN_MCELIECE348864_SSE_MASK5_0: .quad 0x00000000FFFFFFFF, 0x00000000FFFFFFFF +PQCLEAN_MCELIECE348864_SSE_MASK5_1: .quad 0xFFFFFFFF00000000, 0xFFFFFFFF00000000 + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_sse/consts.inc b/src/kem/classic_mceliece/pqclean_mceliece348864_sse/consts.inc new file mode 100644 index 0000000000..87b50f7305 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_sse/consts.inc @@ -0,0 +1,448 @@ +{ + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF00F0FF0F00F0FF0, 0xF00F0FF0F00F0FF0), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F00F0F0F0FF0F0, 0xF0F00F0F0F0FF0F0), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0FF00FF00FF00FF0, 0x0FF00FF00FF00FF0), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAA5555AAAA5555AA, 0xAA5555AAAA5555AA), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF00F0FF0F00F0FF0, 0xF00F0FF0F00F0FF0), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x33CCCC33CC3333CC, 0x33CCCC33CC3333CC), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xCC33CC3333CC33CC, 0xCC33CC3333CC33CC), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x33CC33CC33CC33CC, 0x33CC33CC33CC33CC), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x5A5A5A5A5A5A5A5A, 0x5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF00F0FF0F00F0FF0, 0xF00F0FF0F00F0FF0), +}, +{ + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x3C3C3C3C3C3C3C3C, 0x3C3C3C3C3C3C3C3C), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0x0F0F0F0F0F0F0F0F), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x5555AAAA5555AAAA, 0xAAAA5555AAAA5555), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xCC3333CCCC3333CC, 0xCC3333CCCC3333CC), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xC33CC33CC33CC33C, 0xC33CC33CC33CC33C), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x55555555AAAAAAAA, 0x55555555AAAAAAAA), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x33333333CCCCCCCC, 0x33333333CCCCCCCC), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x00FF00FFFF00FF00, 0xFF00FF0000FF00FF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0x0F0F0F0F0F0F0F0F), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000FFFFFFFF0000, 0x0000FFFFFFFF0000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F00F0F0F0FF0F0, 0xF0F00F0F0F0FF0F0), +}, +{ + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAA55AA5555AA55AA, 0x55AA55AAAA55AA55), + 
PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xCC33CC3333CC33CC, 0xCC33CC3333CC33CC), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x33CCCC33CC3333CC, 0xCC3333CC33CCCC33), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x55555555AAAAAAAA, 0x55555555AAAAAAAA), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFF0000FF00FFFF00, 0xFF0000FF00FFFF00), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x3CC33CC3C33CC33C, 0xC33CC33C3CC33CC3), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x5555AAAA5555AAAA, 0xAAAA5555AAAA5555), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0FF00FF00FF00FF0, 0xF00FF00FF00FF00F), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xCCCC33333333CCCC, 0x3333CCCCCCCC3333), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0x0F0F0F0F0F0F0F0F), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x00FFFF0000FFFF00, 0xFF0000FFFF0000FF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xC33CC33CC33CC33C, 0xC33CC33CC33CC33C), +}, +{ + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAA55AA5555AA55AA, 0x55AA55AAAA55AA55), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x33CC33CCCC33CC33, 0x33CC33CCCC33CC33), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xCC3333CC33CCCC33, 0x33CCCC33CC3333CC), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x55555555AAAAAAAA, 0x55555555AAAAAAAA), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x00FFFF00FF0000FF, 0x00FFFF00FF0000FF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x3CC33CC3C33CC33C, 0xC33CC33C3CC33CC3), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x5555AAAA5555AAAA, 0xAAAA5555AAAA5555), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0FF00FF00FF00FF0, 0xF00FF00FF00FF00F), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x3333CCCCCCCC3333, 0xCCCC33333333CCCC), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0x0F0F0F0F0F0F0F0F), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x00FFFF0000FFFF00, 0xFF0000FFFF0000FF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xC33CC33CC33CC33C, 0xC33CC33CC33CC33C), +}, +{ + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x6699669999669966, 0x9966996666996699), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x33CCCC33CC3333CC, 0x33CCCC33CC3333CC), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xA5A5A5A55A5A5A5A, 0xA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x3C3CC3C3C3C33C3C, 0x3C3CC3C3C3C33C3C), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF00FF00F0FF00FF0, 0x0FF00FF0F00FF00F), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x3C3CC3C3C3C33C3C, 0x3C3CC3C3C3C33C3C), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0xF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x33CCCC33CC3333CC, 0xCC3333CC33CCCC33), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0x0F0F0F0F0F0F0F0F), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A), +}, +{ + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x6699669999669966, 0x9966996666996699), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x33CCCC33CC3333CC, 0x33CCCC33CC3333CC), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x5A5A5A5AA5A5A5A5, 0x5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xC3C33C3C3C3CC3C3, 0xC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0FF00FF0F00FF00F, 0xF00FF00F0FF00FF0), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xC3C33C3C3C3CC3C3, 0xC3C33C3C3C3CC3C3), + 
PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0xF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x33CCCC33CC3333CC, 0xCC3333CC33CCCC33), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0x0F0F0F0F0F0F0F0F), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A), +}, +{ + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x6699669999669966, 0x9966996666996699), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xCC3333CC33CCCC33, 0xCC3333CC33CCCC33), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x5A5A5A5AA5A5A5A5, 0x5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x3C3CC3C3C3C33C3C, 0x3C3CC3C3C3C33C3C), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0FF00FF0F00FF00F, 0xF00FF00F0FF00FF0), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x3C3CC3C3C3C33C3C, 0x3C3CC3C3C3C33C3C), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0xF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x33CCCC33CC3333CC, 0xCC3333CC33CCCC33), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0x0F0F0F0F0F0F0F0F), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A), +}, +{ + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x6699669999669966, 0x9966996666996699), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xCC3333CC33CCCC33, 0xCC3333CC33CCCC33), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xA5A5A5A55A5A5A5A, 0xA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xC3C33C3C3C3CC3C3, 0xC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF00FF00F0FF00FF0, 0x0FF00FF0F00FF00F), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xC3C33C3C3C3CC3C3, 0xC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0xF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x33CCCC33CC3333CC, 0xCC3333CC33CCCC33), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0x0F0F0F0F0F0F0F0F), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A), +}, +{ + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x9669699696696996, 0x9669699696696996), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x6996699669966996, 0x6996699669966996), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x6996699669966996, 0x6996699669966996), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x00FFFF0000FFFF00, 0x00FFFF0000FFFF00), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0x00FF00FF00FF00FF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF00FF00F0FF00FF0, 0x0FF00FF0F00FF00F), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F00F0F0F0FF0F0, 0x0F0FF0F0F0F00F0F), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xC33C3CC33CC3C33C, 0xC33C3CC33CC3C33C), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x3CC3C33C3CC3C33C, 0x3CC3C33C3CC3C33C), +}, +{ + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x9669699696696996, 0x9669699696696996), + 
PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x6996699669966996, 0x6996699669966996), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x6996699669966996, 0x6996699669966996), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFF0000FFFF0000FF, 0xFF0000FFFF0000FF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x00FF00FF00FF00FF, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0FF00FF0F00FF00F, 0xF00FF00F0FF00FF0), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0FF0F0F0F00F0F, 0xF0F00F0F0F0FF0F0), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xC33C3CC33CC3C33C, 0xC33C3CC33CC3C33C), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x3CC3C33C3CC3C33C, 0x3CC3C33C3CC3C33C), +}, +{ + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x9669699696696996, 0x9669699696696996), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x6996699669966996, 0x6996699669966996), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x9669966996699669, 0x9669966996699669), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFF0000FFFF0000FF, 0xFF0000FFFF0000FF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x00FF00FF00FF00FF, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF00FF00F0FF00FF0, 0x0FF00FF0F00FF00F), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F00F0F0F0FF0F0, 0x0F0FF0F0F0F00F0F), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xC33C3CC33CC3C33C, 0xC33C3CC33CC3C33C), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x3CC3C33C3CC3C33C, 0x3CC3C33C3CC3C33C), +}, +{ + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x9669699696696996, 0x9669699696696996), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x6996699669966996, 0x6996699669966996), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x9669966996699669, 0x9669966996699669), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x00FFFF0000FFFF00, 0x00FFFF0000FFFF00), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0x00FF00FF00FF00FF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0FF00FF0F00FF00F, 0xF00FF00F0FF00FF0), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0FF0F0F0F00F0F, 0xF0F00F0F0F0FF0F0), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xC33C3CC33CC3C33C, 0xC33C3CC33CC3C33C), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x3CC3C33C3CC3C33C, 0x3CC3C33C3CC3C33C), +}, +{ + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x9669699696696996, 0x9669699696696996), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x9669966996699669, 0x9669966996699669), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x9669966996699669, 0x9669966996699669), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x00FFFF0000FFFF00, 0x00FFFF0000FFFF00), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0x00FF00FF00FF00FF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF00FF00F0FF00FF0, 0x0FF00FF0F00FF00F), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F00F0F0F0FF0F0, 0x0F0FF0F0F0F00F0F), + 
PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xC33C3CC33CC3C33C, 0xC33C3CC33CC3C33C), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x3CC3C33C3CC3C33C, 0x3CC3C33C3CC3C33C), +}, +{ + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x9669699696696996, 0x9669699696696996), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x9669966996699669, 0x9669966996699669), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x9669966996699669, 0x9669966996699669), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFF0000FFFF0000FF, 0xFF0000FFFF0000FF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x00FF00FF00FF00FF, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0FF00FF0F00FF00F, 0xF00FF00F0FF00FF0), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0FF0F0F0F00F0F, 0xF0F00F0F0F0FF0F0), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xC33C3CC33CC3C33C, 0xC33C3CC33CC3C33C), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x3CC3C33C3CC3C33C, 0x3CC3C33C3CC3C33C), +}, +{ + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x9669699696696996, 0x9669699696696996), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x9669966996699669, 0x9669966996699669), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x6996699669966996, 0x6996699669966996), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFF0000FFFF0000FF, 0xFF0000FFFF0000FF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x00FF00FF00FF00FF, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF00FF00F0FF00FF0, 0x0FF00FF0F00FF00F), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F00F0F0F0FF0F0, 0x0F0FF0F0F0F00F0F), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xC33C3CC33CC3C33C, 0xC33C3CC33CC3C33C), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x3CC3C33C3CC3C33C, 0x3CC3C33C3CC3C33C), +}, +{ + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x9669699696696996, 0x9669699696696996), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x9669966996699669, 0x9669966996699669), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x6996699669966996, 0x6996699669966996), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x00FFFF0000FFFF00, 0x00FFFF0000FFFF00), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0x00FF00FF00FF00FF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0FF00FF0F00FF00F, 0xF00FF00F0FF00FF0), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0FF0F0F0F00F0F, 0xF0F00F0F0F0FF0F0), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xC33C3CC33CC3C33C, 0xC33C3CC33CC3C33C), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x3CC3C33C3CC3C33C, 0x3CC3C33C3CC3C33C), +}, +{ + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + 
PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + 
PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + 
PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + 
PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + 
PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_sse/controlbits.c b/src/kem/classic_mceliece/pqclean_mceliece348864_sse/controlbits.c new file mode 100644 index 0000000000..0908baf747 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_sse/controlbits.c @@ -0,0 +1,274 @@ +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. 
a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + +#include "controlbits.h" + +#include "params.h" + +#include + +typedef uint8_t bit; + +#define N (1 << GFBITS) + +static bit is_smaller(uint32_t a, uint32_t b) { + uint32_t ret = 0; + + ret = a - b; + ret >>= 31; + + return (bit)ret; +} + +static bit is_smaller_63b(uint64_t a, uint64_t b) { + uint64_t ret = 0; + + ret = a - b; + ret >>= 63; + + return (bit)ret; +} + +static void cswap(uint32_t *x, uint32_t *y, bit swap) { + uint32_t m; + uint32_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +static void cswap_63b(uint64_t *x, uint64_t *y, bit swap) { + uint64_t m; + uint64_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +/* output x = min(input x,input y) */ +/* output y = max(input x,input y) */ + +static void minmax(uint32_t *x, uint32_t *y) { + bit m; + + m = is_smaller(*y, *x); + cswap(x, y, m); +} + +static void minmax_63b(uint64_t *x, uint64_t *y) { + bit m; + + m = is_smaller_63b(*y, *x); + cswap_63b(x, y, m); +} + +/* merge first half of x[0],x[step],...,x[(2*n-1)*step] with second half */ +/* requires n to be a power of 2 */ + +static void merge(int n, uint32_t *x, int step) { + int i; + if (n == 1) { + minmax(&x[0], &x[step]); + } else { + merge(n / 2, x, step * 2); + merge(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax(&x[i * step], &x[(i + 1) * step]); + } + } +} + +static void merge_63b(int n, uint64_t *x, int step) { + int i; + if (n == 1) { + minmax_63b(&x[0], &x[step]); + } else { + merge_63b(n / 2, x, step * 2); + merge_63b(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax_63b(&x[i * step], &x[(i + 1) * step]); + } + } +} + +/* sort x[0],x[1],...,x[n-1] in place */ +/* requires n to be a power of 2 */ + +static void sort(int n, uint32_t *x) { + if (n <= 1) { + return; + } + sort(n / 2, x); + sort(n / 2, x + n / 2); + merge(n / 2, x, 1); +} + +void PQCLEAN_MCELIECE348864_SSE_sort_63b(int n, uint64_t *x) { + if (n <= 1) { + return; + } + PQCLEAN_MCELIECE348864_SSE_sort_63b(n / 2, x); + PQCLEAN_MCELIECE348864_SSE_sort_63b(n / 2, x + n / 2); + merge_63b(n / 2, x, 1); +} + +/* y[pi[i]] = x[i] */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void composeinv(int n, uint32_t *y, const uint32_t *x, const uint32_t *pi) { // NC + int i; + uint32_t t[2 * N]; + + for (i = 0; i < n; ++i) { + t[i] = x[i] | (pi[i] << 16); + } + + sort(n, t); + + for (i = 0; i < n; ++i) { + y[i] = t[i] & 0xFFFF; + } +} + +/* ip[i] = j iff pi[i] = j */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void invert(int n, uint32_t *ip, const uint32_t *pi) { + int i; + + for (i = 0; i < n; i++) { + ip[i] = i; + } + + composeinv(n, ip, ip, pi); +} + + +static void flow(int w, uint32_t *x, const uint32_t *y, int t) { + bit m0; + bit m1; + + uint32_t b; + uint32_t y_copy = *y; + + m0 = is_smaller(*y & ((1 << w) - 1), *x & ((1 << w) - 1)); + m1 = is_smaller(0, t); + + cswap(x, &y_copy, m0); + b = m0 & m1; + *x ^= b << w; +} + +/* input: permutation pi */ +/* output: (2w-1)n/2 (or 0 if n==1) control bits c[0],c[step],c[2*step],... 
*/ +/* requires n = 2^w */ +static void controlbitsfrompermutation(int w, int n, int step, int off, unsigned char *c, const uint32_t *pi) { + int i; + int j; + int k; + int t; + uint32_t ip[N] = {0}; + uint32_t I[2 * N] = {0}; + uint32_t P[2 * N] = {0}; + uint32_t PI[2 * N] = {0}; + uint32_t T[2 * N] = {0}; + uint32_t piflip[N] = {0}; + uint32_t subpi[2][N / 2] = {{0}}; + + if (w == 1) { + c[ off / 8 ] |= (pi[0] & 1) << (off % 8); + } + if (w <= 1) { + return; + } + + invert(n, ip, pi); + + for (i = 0; i < n; ++i) { + I[i] = ip[i] | (1 << w); + I[n + i] = pi[i]; + } + + for (i = 0; i < 2 * n; ++i) { + P[i] = (i >> w) + (i & ((1 << w) - 2)) + ((i & 1) << w); + } + + for (t = 0; t < w; ++t) { + composeinv(2 * n, PI, P, I); + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &PI[i], t); + } + + for (i = 0; i < 2 * n; ++i) { + T[i] = I[i ^ 1]; + } + + composeinv(2 * n, I, I, T); + + for (i = 0; i < 2 * n; ++i) { + T[i] = P[i ^ 1]; + } + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &T[i], 1); + } + } + + for (i = 0; i < n; ++i) { + for (j = 0; j < w; ++j) { + piflip[i] = pi[i]; + } + } + + for (i = 0; i < n / 2; ++i) { + c[ (off + i * step) / 8 ] |= ((P[i * 2] >> w) & 1) << ((off + i * step) % 8); + } + for (i = 0; i < n / 2; ++i) { + c[ (off + ((w - 1)*n + i) * step) / 8 ] |= ((P[n + i * 2] >> w) & 1) << ((off + ((w - 1) * n + i) * step) % 8); + } + + for (i = 0; i < n / 2; ++i) { + cswap(&piflip[i * 2], &piflip[i * 2 + 1], (P[n + i * 2] >> w) & 1); + } + + for (k = 0; k < 2; ++k) { + for (i = 0; i < n / 2; ++i) { + subpi[k][i] = piflip[i * 2 + k] >> 1; + } + } + + for (k = 0; k < 2; ++k) { + controlbitsfrompermutation(w - 1, n / 2, step * 2, off + step * (n / 2 + k), c, subpi[k]); + } +} + +/* input: pi, a permutation*/ +/* output: out, control bits w.r.t. pi */ +void PQCLEAN_MCELIECE348864_SSE_controlbits(unsigned char *out, const uint32_t *pi) { + unsigned int i; + unsigned char c[ (2 * GFBITS - 1) * (1 << GFBITS) / 16 ]; + + for (i = 0; i < sizeof(c); i++) { + c[i] = 0; + } + + controlbitsfrompermutation(GFBITS, (1 << GFBITS), 1, 0, c, pi); + + for (i = 0; i < sizeof(c); i++) { + out[i] = c[i]; + } +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_sse/controlbits.h b/src/kem/classic_mceliece/pqclean_mceliece348864_sse/controlbits.h new file mode 100644 index 0000000000..b32ba7b749 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_sse/controlbits.h @@ -0,0 +1,15 @@ +#ifndef PQCLEAN_MCELIECE348864_SSE_CONTROLBITS_H +#define PQCLEAN_MCELIECE348864_SSE_CONTROLBITS_H +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. 
a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + + +#include + +void PQCLEAN_MCELIECE348864_SSE_sort_63b(int n, uint64_t *x); +void PQCLEAN_MCELIECE348864_SSE_controlbits(unsigned char *out, const uint32_t *pi); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_sse/crypto_hash.h b/src/kem/classic_mceliece/pqclean_mceliece348864_sse/crypto_hash.h new file mode 100644 index 0000000000..c69e5f3c89 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_sse/crypto_hash.h @@ -0,0 +1,7 @@ +#ifndef PQCLEAN_MCELIECE348864_SSE_CRYPTO_HASH_H +#define PQCLEAN_MCELIECE348864_SSE_CRYPTO_HASH_H +#include "fips202.h" + +#define crypto_hash_32b(out,in,inlen) shake256(out, 32, in, inlen) + +#endif diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_sse/decrypt.c b/src/kem/classic_mceliece/pqclean_mceliece348864_sse/decrypt.c new file mode 100644 index 0000000000..653bd0054b --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_sse/decrypt.c @@ -0,0 +1,203 @@ +/* + This file is for Niederreiter decryption +*/ + +#include "decrypt.h" + +#include "benes.h" +#include "bm.h" +#include "fft.h" +#include "fft_tr.h" +#include "params.h" +#include "util.h" + +#include + +static void scaling(vec128 out[][GFBITS], vec128 inv[][GFBITS], const unsigned char *sk, vec128 *recv) { + int i, j; + + uint64_t irr_int[ GFBITS ]; + vec128 eval[32][ GFBITS ]; + vec128 tmp[ GFBITS ]; + + // + + PQCLEAN_MCELIECE348864_SSE_irr_load(irr_int, sk); + + PQCLEAN_MCELIECE348864_SSE_fft(eval, irr_int); + + for (i = 0; i < 32; i++) { + PQCLEAN_MCELIECE348864_SSE_vec128_sq(eval[i], eval[i]); + } + + PQCLEAN_MCELIECE348864_SSE_vec128_copy(inv[0], eval[0]); + + for (i = 1; i < 32; i++) { + PQCLEAN_MCELIECE348864_SSE_vec128_mul(inv[i], inv[i - 1], eval[i]); + } + + PQCLEAN_MCELIECE348864_SSE_vec128_inv(tmp, inv[31]); + + for (i = 30; i >= 0; i--) { + PQCLEAN_MCELIECE348864_SSE_vec128_mul(inv[i + 1], tmp, inv[i]); + PQCLEAN_MCELIECE348864_SSE_vec128_mul(tmp, tmp, eval[i + 1]); + } + + PQCLEAN_MCELIECE348864_SSE_vec128_copy(inv[0], tmp); + + // + + for (i = 0; i < 32; i++) { + for (j = 0; j < GFBITS; j++) { + out[i][j] = PQCLEAN_MCELIECE348864_SSE_vec128_and(inv[i][j], recv[i]); + } + } +} + +static void preprocess(vec128 *recv, const unsigned char *s) { + int i; + uint8_t r[ 512 ]; + + for (i = 0; i < SYND_BYTES; i++) { + r[i] = s[i]; + } + + for (i = SYND_BYTES; i < 512; i++) { + r[i] = 0; + } + + for (i = 0; i < 32; i++) { + recv[i] = PQCLEAN_MCELIECE348864_SSE_load16(r + i * 16); + } +} + +static void postprocess(unsigned char *e, vec128 *err) { + int i; + unsigned char error8[ (1 << GFBITS) / 8 ]; + uint64_t v[2]; + + for (i = 0; i < 32; i++) { + v[0] = PQCLEAN_MCELIECE348864_SSE_vec128_extract(err[i], 0); + v[1] = PQCLEAN_MCELIECE348864_SSE_vec128_extract(err[i], 1); + + PQCLEAN_MCELIECE348864_SSE_store8(error8 + i * 16 + 0, v[0]); + PQCLEAN_MCELIECE348864_SSE_store8(error8 + i * 16 + 8, v[1]); + } + + for (i = 0; i < SYS_N / 8; i++) { + e[i] = error8[i]; + } +} + +static void scaling_inv(vec128 out[][GFBITS], vec128 inv[][GFBITS], vec128 *recv) { + int i, j; + + for (i = 0; i < 32; i++) { + for (j = 0; j < GFBITS; j++) { + out[i][j] = PQCLEAN_MCELIECE348864_SSE_vec128_and(inv[i][j], recv[i]); + } + } +} + +static uint16_t weight_check(unsigned char *e, vec128 *error) { + int i; + uint16_t w0 = 0; + uint16_t w1 = 0; + uint16_t check; + + for (i = 0; i < 32; i++) { + w0 += _mm_popcnt_u64( 
PQCLEAN_MCELIECE348864_SSE_vec128_extract(error[i], 0) ); + w0 += _mm_popcnt_u64( PQCLEAN_MCELIECE348864_SSE_vec128_extract(error[i], 1) ); + } + + for (i = 0; i < SYS_N / 8; i++) { + w1 += _mm_popcnt_u32( e[i] ); + } + + check = (w0 ^ SYS_T) | (w1 ^ SYS_T); + check -= 1; + check >>= 15; + + return check; +} + +static uint64_t synd_cmp(vec128 s0[ GFBITS ], vec128 s1[ GFBITS ]) { + int i; + vec128 diff; + + diff = PQCLEAN_MCELIECE348864_SSE_vec128_xor(s0[0], s1[0]); + + for (i = 1; i < GFBITS; i++) { + diff = PQCLEAN_MCELIECE348864_SSE_vec128_or(diff, PQCLEAN_MCELIECE348864_SSE_vec128_xor(s0[i], s1[i])); + } + + return PQCLEAN_MCELIECE348864_SSE_vec128_testz(diff); +} + +/* Niederreiter decryption with the Berlekamp decoder */ +/* intput: sk, secret key */ +/* c, ciphertext (syndrome) */ +/* output: e, error vector */ +/* return: 0 for success; 1 for failure */ +int PQCLEAN_MCELIECE348864_SSE_decrypt(unsigned char *e, const unsigned char *sk, const unsigned char *c) { + int i; + + uint16_t check_synd; + uint16_t check_weight; + + vec128 inv[ 32 ][ GFBITS ]; + vec128 scaled[ 32 ][ GFBITS ]; + vec128 eval[ 32 ][ GFBITS ]; + + vec128 error[ 32 ]; + + vec128 s_priv[ GFBITS ]; + vec128 s_priv_cmp[ GFBITS ]; + + uint64_t locator[ GFBITS ]; + + vec128 recv[ 32 ]; + vec128 allone; + + uint64_t bits_int[23][32]; + + // Berlekamp decoder + + preprocess(recv, c); + + PQCLEAN_MCELIECE348864_SSE_load_bits(bits_int, sk + IRR_BYTES); + PQCLEAN_MCELIECE348864_SSE_benes((uint64_t *) recv, bits_int, 1); + + scaling(scaled, inv, sk, recv); + + PQCLEAN_MCELIECE348864_SSE_fft_tr(s_priv, scaled); + + PQCLEAN_MCELIECE348864_SSE_bm(locator, s_priv); + + PQCLEAN_MCELIECE348864_SSE_fft(eval, locator); + + // reencryption and weight check + + allone = PQCLEAN_MCELIECE348864_SSE_vec128_setbits(1); + + for (i = 0; i < 32; i++) { + error[i] = PQCLEAN_MCELIECE348864_SSE_vec128_or_reduce(eval[i]); + error[i] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(error[i], allone); + } + + scaling_inv(scaled, inv, error); + PQCLEAN_MCELIECE348864_SSE_fft_tr(s_priv_cmp, scaled); + + check_synd = synd_cmp(s_priv, s_priv_cmp); + + // + + PQCLEAN_MCELIECE348864_SSE_benes((uint64_t *) error, bits_int, 0); + + postprocess(e, error); + + check_weight = weight_check(e, error); + + return 1 - (check_synd & check_weight); +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_sse/decrypt.h b/src/kem/classic_mceliece/pqclean_mceliece348864_sse/decrypt.h new file mode 100644 index 0000000000..91fa2f4b8d --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_sse/decrypt.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_MCELIECE348864_SSE_DECRYPT_H +#define PQCLEAN_MCELIECE348864_SSE_DECRYPT_H +/* + This file is for Nieddereiter decryption +*/ + +int PQCLEAN_MCELIECE348864_SSE_decrypt(unsigned char * /*e*/, const unsigned char * /*sk*/, const unsigned char * /*c*/); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_sse/encrypt.c b/src/kem/classic_mceliece/pqclean_mceliece348864_sse/encrypt.c new file mode 100644 index 0000000000..ca5fa765fd --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_sse/encrypt.c @@ -0,0 +1,99 @@ +/* + This file is for Niederreiter encryption +*/ + +#include "encrypt.h" + +#include "gf.h" +#include "params.h" +#include "randombytes.h" +#include "util.h" + +#include +#include +#include +#include + +/* input: public key pk, error vector e */ +/* output: syndrome s */ +extern void PQCLEAN_MCELIECE348864_SSE_syndrome_asm(unsigned char *s, const unsigned char *pk, unsigned char *e); 
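
gen_e below implements fixed-weight sampling by rejection: it draws a batch of candidate indices from randombytes, keeps those below SYS_N, retries whenever fewer than SYS_T survive or any index repeats, and then scatters the surviving indices into a bit vector using constant-time masks. The sketch that follows shows only that control flow; rand() stands in for randombytes, the DEMO_* parameters are illustrative rather than the params.h values, and the bit store is a plain variable-time write, so none of it is cryptographically meaningful.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Illustrative parameters only; the real values come from params.h. */
#define DEMO_N 64   /* code length (stands in for SYS_N)  */
#define DEMO_T 6    /* error weight (stands in for SYS_T) */

/* Sample a weight-DEMO_T error vector into e[DEMO_N/8] by rejection.
   rand() is a stand-in for randombytes(); do not use this for crypto. */
static void demo_gen_e(uint8_t e[DEMO_N / 8]) {
    uint16_t ind[DEMO_T];
    int count, i, j, repeat;

    for (;;) {
        /* draw candidates and keep the in-range ones, as gen_e does */
        count = 0;
        for (i = 0; i < 2 * DEMO_T && count < DEMO_T; i++) {
            uint16_t cand = (uint16_t)(rand() & 0x7F); /* mimics masking with GFMASK */
            if (cand < DEMO_N) {
                ind[count++] = cand;
            }
        }
        if (count < DEMO_T) {
            continue;              /* not enough in-range candidates: retry */
        }

        /* reject the whole draw if any index repeats */
        repeat = 0;
        for (i = 1; i < DEMO_T; i++) {
            for (j = 0; j < i; j++) {
                if (ind[i] == ind[j]) {
                    repeat = 1;
                }
            }
        }
        if (!repeat) {
            break;
        }
    }

    for (i = 0; i < DEMO_N / 8; i++) {
        e[i] = 0;
    }
    for (j = 0; j < DEMO_T; j++) {
        e[ind[j] / 8] |= (uint8_t)(1 << (ind[j] % 8));
    }
}

int main(void) {
    uint8_t e[DEMO_N / 8];
    demo_gen_e(e);
    for (int i = 0; i < DEMO_N / 8; i++) {
        printf("%02x", e[i]);
    }
    printf("\n");
    return 0;
}
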
+ +/* output: e, an error vector of weight t */ +static void gen_e(unsigned char *e) { + size_t i, j; + int eq, count; + + uint16_t ind_[ SYS_T * 2 ]; + uint16_t ind[ SYS_T * 2 ]; + uint64_t e_int[ (SYS_N + 63) / 64 ]; + uint64_t one = 1; + uint64_t mask; + uint64_t val[ SYS_T ]; + + while (1) { + randombytes((uint8_t *)ind_, sizeof(ind_)); + + for (i = 0; i < SYS_T * 2; i++) { + ind_[i] &= GFMASK; + } + + count = 0; + for (i = 0; i < SYS_T * 2; i++) { + if (ind_[i] < SYS_N) { + ind[ count++ ] = ind_[i]; + } + } + + if (count < SYS_T) { + continue; + } + + // check for repetition + + eq = 0; + + for (i = 1; i < SYS_T; i++) { + for (j = 0; j < i; j++) { + if (ind[i] == ind[j]) { + eq = 1; + } + } + } + + if (eq == 0) { + break; + } + } + + for (j = 0; j < SYS_T; j++) { + val[j] = one << (ind[j] & 63); + } + + for (i = 0; i < (SYS_N + 63) / 64; i++) { + e_int[i] = 0; + + for (j = 0; j < SYS_T; j++) { + mask = i ^ (ind[j] >> 6); + mask -= 1; + mask >>= 63; + mask = -mask; + + e_int[i] |= val[j] & mask; + } + } + + for (i = 0; i < (SYS_N + 63) / 64 - 1; i++) { + PQCLEAN_MCELIECE348864_SSE_store8(e, e_int[i]); + e += 8; + } + + for (j = 0; j < (SYS_N % 64); j += 8) { + e[ j / 8 ] = (e_int[i] >> j) & 0xFF; + } +} + +void PQCLEAN_MCELIECE348864_SSE_encrypt(unsigned char *s, unsigned char *e, const unsigned char *pk) { + gen_e(e); + PQCLEAN_MCELIECE348864_SSE_syndrome_asm(s, pk, e); +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_sse/encrypt.h b/src/kem/classic_mceliece/pqclean_mceliece348864_sse/encrypt.h new file mode 100644 index 0000000000..bdc0782326 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_sse/encrypt.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_MCELIECE348864_SSE_ENCRYPT_H +#define PQCLEAN_MCELIECE348864_SSE_ENCRYPT_H +/* + This file is for Niederreiter encryption +*/ + + +void PQCLEAN_MCELIECE348864_SSE_encrypt(unsigned char * /*s*/, unsigned char * /*e*/, const unsigned char * /*pk*/); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_sse/fft.c b/src/kem/classic_mceliece/pqclean_mceliece348864_sse/fft.c new file mode 100644 index 0000000000..c8b1b23f56 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_sse/fft.c @@ -0,0 +1,155 @@ +/* + This file is for the Gao-Mateer FFT + sse http://www.math.clemson.edu/~sgao/papers/GM10.pdf +*/ + +#include "fft.h" + +#include "vec.h" +#include "vec128.h" + +/* input: in, polynomial in bitsliced form */ +/* output: in, result of applying the radix conversions on in */ +static void radix_conversions(uint64_t *in) { + int i, j, k; + + const uint64_t mask[5][2] = { + {0x8888888888888888, 0x4444444444444444}, + {0xC0C0C0C0C0C0C0C0, 0x3030303030303030}, + {0xF000F000F000F000, 0x0F000F000F000F00}, + {0xFF000000FF000000, 0x00FF000000FF0000}, + {0xFFFF000000000000, 0x0000FFFF00000000} + }; + + const uint64_t s[5][GFBITS] = { +#include "scalars.inc" + }; + + // + + for (j = 0; j <= 4; j++) { + for (i = 0; i < GFBITS; i++) { + for (k = 4; k >= j; k--) { + in[i] ^= (in[i] & mask[k][0]) >> (1 << k); + in[i] ^= (in[i] & mask[k][1]) >> (1 << k); + } + } + + PQCLEAN_MCELIECE348864_SSE_vec_mul(in, in, s[j]); // scaling + } +} + +/* input: in, result of applying the radix conversions to the input polynomial */ +/* output: out, evaluation results (by applying the FFT butterflies) */ +static void butterflies(vec128 out[][ GFBITS ], const uint64_t *in) { + int i, j, k, s, b; + + uint64_t t0, t1; + + const vec128 consts[ 32 ][ GFBITS ] = { +#include "consts.inc" + }; + + uint64_t consts_ptr = 0; + + 
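
The reversal table that follows lists the 6-bit bit-reversal permutation (2^6 = 64 evaluation points) used to feed coefficients to the butterflies in the order the Gao-Mateer FFT expects. As a sanity check it can be regenerated with a few lines of C; this is an illustrative sketch with hypothetical demo_* names, and the table below remains the authoritative copy.

#include <stdint.h>
#include <stdio.h>

/* Reverse the w low-order bits of x (here w = 6). */
static uint8_t demo_bit_reverse(uint8_t x, int w) {
    uint8_t r = 0;
    for (int i = 0; i < w; i++) {
        r = (uint8_t)((r << 1) | ((x >> i) & 1));
    }
    return r;
}

int main(void) {
    /* Prints 0, 32, 16, 48, 8, 40, ... matching the reversal[64] table below. */
    for (int j = 0; j < 64; j++) {
        printf("%d%s", demo_bit_reverse((uint8_t)j, 6), (j + 1) % 8 ? ", " : ",\n");
    }
    return 0;
}
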
const uint8_t reversal[64] = { + 0, 32, 16, 48, 8, 40, 24, 56, + 4, 36, 20, 52, 12, 44, 28, 60, + 2, 34, 18, 50, 10, 42, 26, 58, + 6, 38, 22, 54, 14, 46, 30, 62, + 1, 33, 17, 49, 9, 41, 25, 57, + 5, 37, 21, 53, 13, 45, 29, 61, + 3, 35, 19, 51, 11, 43, 27, 59, + 7, 39, 23, 55, 15, 47, 31, 63 + }; + + // boradcast + + vec128 tmp[ GFBITS ]; + vec128 x[ GFBITS ], y[ GFBITS ]; + + for (j = 0; j < 64; j += 4) { + for (i = 0; i < GFBITS; i++) { + t0 = (in[i] >> reversal[j + 0]) & 1; + t0 = -t0; + t1 = (in[i] >> reversal[j + 2]) & 1; + t1 = -t1; + + out[j / 2 + 0][i] = PQCLEAN_MCELIECE348864_SSE_vec128_set2x(t0, t1); + + t0 = (in[i] >> reversal[j + 1]) & 1; + t0 = -t0; + t1 = (in[i] >> reversal[j + 3]) & 1; + t1 = -t1; + + out[j / 2 + 1][i] = PQCLEAN_MCELIECE348864_SSE_vec128_set2x(t0, t1); + } + } + + // + + + for (i = 0; i < 32; i += 2) { + PQCLEAN_MCELIECE348864_SSE_vec128_mul(tmp, out[i + 1], consts[ 0 ]); + + for (b = 0; b < GFBITS; b++) { + out[i + 0][b] ^= tmp[b]; + } + for (b = 0; b < GFBITS; b++) { + out[i + 1][b] ^= out[i + 0][b]; + } + + for (b = 0; b < GFBITS; b++) { + x[b] = PQCLEAN_MCELIECE348864_SSE_vec128_unpack_low(out[i + 0][b], out[i + 1][b]); + } + for (b = 0; b < GFBITS; b++) { + y[b] = PQCLEAN_MCELIECE348864_SSE_vec128_unpack_high(out[i + 0][b], out[i + 1][b]); + } + + for (b = 0; b < GFBITS; b++) { + out[i + 0][b] = x[b]; + } + for (b = 0; b < GFBITS; b++) { + out[i + 1][b] = y[b]; + } + } + + consts_ptr += 1; + + for (i = 0; i <= 4; i++) { + s = 1 << i; + + for (j = 0; j < 32; j += 2 * s) { + for (k = j; k < j + s; k++) { + PQCLEAN_MCELIECE348864_SSE_vec128_mul(tmp, out[k + s], consts[ consts_ptr + (k - j) ]); + + for (b = 0; b < GFBITS; b++) { + out[k][b] ^= tmp[b]; + } + for (b = 0; b < GFBITS; b++) { + out[k + s][b] ^= out[k][b]; + } + } + } + + consts_ptr += s; + } + + // adding the part contributed by x^64 + + vec128 powers[32][GFBITS] = { +#include "powers.inc" + }; + + for (i = 0; i < 32; i++) { + for (b = 0; b < GFBITS; b++) { + out[i][b] ^= powers[i][b]; + } + } +} + +void PQCLEAN_MCELIECE348864_SSE_fft(vec128 out[][ GFBITS ], uint64_t *in) { + radix_conversions(in); + butterflies(out, in); +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_sse/fft.h b/src/kem/classic_mceliece/pqclean_mceliece348864_sse/fft.h new file mode 100644 index 0000000000..4fecb38b8e --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_sse/fft.h @@ -0,0 +1,17 @@ +#ifndef PQCLEAN_MCELIECE348864_SSE_FFT_H +#define PQCLEAN_MCELIECE348864_SSE_FFT_H +/* + This file is for the Gao-Mateer FFT + sse http://www.math.clemson.edu/~sgao/papers/GM10.pdf +*/ + + +#include + +#include "params.h" +#include "vec128.h" + +void PQCLEAN_MCELIECE348864_SSE_fft(vec128 /*out*/[][GFBITS], uint64_t * /*in*/); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_sse/fft_tr.c b/src/kem/classic_mceliece/pqclean_mceliece348864_sse/fft_tr.c new file mode 100644 index 0000000000..74c794306a --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_sse/fft_tr.c @@ -0,0 +1,312 @@ +/* + This file is for transpose of the Gao-Mateer FFT + Functions with names ending with _tr are (roughly) the transpose of the corresponding functions in fft.c +*/ + +#include "fft_tr.h" + +#include "transpose.h" +#include "vec.h" +#include "vec128.h" + +#include + +static void radix_conversions_tr(vec128 in[ GFBITS ]) { + int i, j, k; + + const vec128 mask[10] = { + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x2222222222222222, 0x2222222222222222), + 
PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x4444444444444444, 0x4444444444444444), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0C0C0C0C0C0C0C0C, 0x0C0C0C0C0C0C0C0C), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x3030303030303030, 0x3030303030303030), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x00F000F000F000F0, 0x00F000F000F000F0), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F000F000F000F00, 0x0F000F000F000F00), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000FF000000FF00, 0x0000FF000000FF00), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x00FF000000FF0000, 0x00FF000000FF0000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x00000000FFFF0000, 0x00000000FFFF0000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000FFFF00000000, 0x0000FFFF00000000) + }; + + const vec128 s[5][GFBITS] = { +#include "scalars_2x.inc" + }; + + uint64_t v0, v1; + + // + + for (j = 5; j >= 0; j--) { + + if (j < 5) { + PQCLEAN_MCELIECE348864_SSE_vec128_mul(in, in, s[j]); + } + + for (i = 0; i < GFBITS; i++) { + for (k = j; k <= 4; k++) { + in[i] ^= PQCLEAN_MCELIECE348864_SSE_vec128_sll_2x(in[i] & mask[2 * k + 0], 1 << k); + in[i] ^= PQCLEAN_MCELIECE348864_SSE_vec128_sll_2x(in[i] & mask[2 * k + 1], 1 << k); + } + } + + for (i = 0; i < GFBITS; i++) { + v0 = PQCLEAN_MCELIECE348864_SSE_vec128_extract(in[i], 0); + v1 = PQCLEAN_MCELIECE348864_SSE_vec128_extract(in[i], 1); + + v1 ^= v0 >> 32; + v1 ^= v1 << 32; + + in[i] = PQCLEAN_MCELIECE348864_SSE_vec128_set2x(v0, v1); + } + } +} + +static void butterflies_tr(vec128 out[ GFBITS ], vec128 in[][ GFBITS ]) { + int i, j, k, s, b; + + uint64_t t[ GFBITS ]; + uint64_t pre[6][ GFBITS ]; + + uint64_t out64[2][GFBITS]; + + vec128 p2[ 6 ]; + vec128 buf[64]; + vec128 tt[ GFBITS ]; + vec128 x[ GFBITS ], y[ GFBITS ]; + + const vec128 consts[ 32 ][ GFBITS ] = { +#include "consts.inc" + }; + + uint64_t consts_ptr = 32; + + const uint8_t reversal[64] = { + 0, 32, 16, 48, 8, 40, 24, 56, + 4, 36, 20, 52, 12, 44, 28, 60, + 2, 34, 18, 50, 10, 42, 26, 58, + 6, 38, 22, 54, 14, 46, 30, 62, + 1, 33, 17, 49, 9, 41, 25, 57, + 5, 37, 21, 53, 13, 45, 29, 61, + 3, 35, 19, 51, 11, 43, 27, 59, + 7, 39, 23, 55, 15, 47, 31, 63 + }; + + const uint16_t beta[6] = {8, 1300, 3408, 1354, 2341, 1154}; + + // butterflies + + for (i = 4; i >= 0; i--) { + s = 1 << i; + consts_ptr -= s; + + for (j = 0; j < 32; j += 2 * s) { + for (k = j; k < j + s; k++) { + for (b = 0; b < GFBITS; b++) { + in[k][b] ^= in[k + s][b]; + } + PQCLEAN_MCELIECE348864_SSE_vec128_mul(tt, in[k], consts[ consts_ptr + (k - j) ]); + for (b = 0; b < GFBITS; b++) { + in[k + s][b] ^= tt[b]; + } + } + } + } + + for (i = 0; i < 32; i += 2) { + for (b = 0; b < GFBITS; b++) { + x[b] = PQCLEAN_MCELIECE348864_SSE_vec128_unpack_low(in[i + 0][b], in[i + 1][b]); + } + for (b = 0; b < GFBITS; b++) { + y[b] = PQCLEAN_MCELIECE348864_SSE_vec128_unpack_high(in[i + 0][b], in[i + 1][b]); + } + + for (b = 0; b < GFBITS; b++) { + in[i + 0][b] = x[b] ^ y[b]; + } + PQCLEAN_MCELIECE348864_SSE_vec128_mul(tt, in[i + 0], consts[ 0 ]); + for (b = 0; b < GFBITS; b++) { + in[i + 1][b] = y[b] ^ tt[b]; + } + } + + // transpose + + for (i = 0; i < GFBITS; i += 2) { + for (j = 0; j < 64; j += 4) { + buf[ reversal[j + 0] ] = PQCLEAN_MCELIECE348864_SSE_vec128_set2x(PQCLEAN_MCELIECE348864_SSE_vec128_extract(in[j / 2 + 0][i + 0], 0), + PQCLEAN_MCELIECE348864_SSE_vec128_extract(in[j / 2 + 0][i + 1], 0)); + buf[ reversal[j + 1] ] = PQCLEAN_MCELIECE348864_SSE_vec128_set2x(PQCLEAN_MCELIECE348864_SSE_vec128_extract(in[j / 2 + 1][i + 0], 0), + PQCLEAN_MCELIECE348864_SSE_vec128_extract(in[j / 2 + 1][i 
+ 1], 0)); + buf[ reversal[j + 2] ] = PQCLEAN_MCELIECE348864_SSE_vec128_set2x(PQCLEAN_MCELIECE348864_SSE_vec128_extract(in[j / 2 + 0][i + 0], 1), + PQCLEAN_MCELIECE348864_SSE_vec128_extract(in[j / 2 + 0][i + 1], 1)); + buf[ reversal[j + 3] ] = PQCLEAN_MCELIECE348864_SSE_vec128_set2x(PQCLEAN_MCELIECE348864_SSE_vec128_extract(in[j / 2 + 1][i + 0], 1), + PQCLEAN_MCELIECE348864_SSE_vec128_extract(in[j / 2 + 1][i + 1], 1)); + } + + PQCLEAN_MCELIECE348864_SSE_transpose_64x128_sp(buf); + + p2[0] = buf[32]; + buf[33] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[33], buf[32]); + p2[1] = buf[33]; + buf[35] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[35], buf[33]); + p2[0] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[0], buf[35]); + buf[34] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[34], buf[35]); + p2[2] = buf[34]; + buf[38] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[38], buf[34]); + p2[0] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[0], buf[38]); + buf[39] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[39], buf[38]); + p2[1] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[1], buf[39]); + buf[37] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[37], buf[39]); + p2[0] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[0], buf[37]); + buf[36] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[36], buf[37]); + p2[3] = buf[36]; + buf[44] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[44], buf[36]); + p2[0] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[0], buf[44]); + buf[45] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[45], buf[44]); + p2[1] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[1], buf[45]); + buf[47] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[47], buf[45]); + p2[0] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[0], buf[47]); + buf[46] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[46], buf[47]); + p2[2] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[2], buf[46]); + buf[42] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[42], buf[46]); + p2[0] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[0], buf[42]); + buf[43] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[43], buf[42]); + p2[1] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[1], buf[43]); + buf[41] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[41], buf[43]); + p2[0] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[0], buf[41]); + buf[40] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[40], buf[41]); + p2[4] = buf[40]; + buf[56] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[56], buf[40]); + p2[0] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[0], buf[56]); + buf[57] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[57], buf[56]); + p2[1] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[1], buf[57]); + buf[59] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[59], buf[57]); + p2[0] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[0], buf[59]); + buf[58] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[58], buf[59]); + p2[2] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[2], buf[58]); + buf[62] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[62], buf[58]); + p2[0] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[0], buf[62]); + buf[63] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[63], buf[62]); + p2[1] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[1], buf[63]); + buf[61] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[61], buf[63]); + p2[0] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[0], buf[61]); + buf[60] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[60], buf[61]); + p2[3] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[3], buf[60]); + buf[52] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[52], buf[60]); + p2[0] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[0], buf[52]); + buf[53] = 
PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[53], buf[52]); + p2[1] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[1], buf[53]); + buf[55] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[55], buf[53]); + p2[0] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[0], buf[55]); + buf[54] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[54], buf[55]); + p2[2] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[2], buf[54]); + buf[50] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[50], buf[54]); + p2[0] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[0], buf[50]); + buf[51] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[51], buf[50]); + p2[1] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[1], buf[51]); + buf[49] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[49], buf[51]); + p2[0] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[0], buf[49]); + buf[48] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[48], buf[49]); + p2[5] = buf[48]; + buf[16] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[16], buf[48]); + p2[0] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[0], buf[16]); + buf[17] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[17], buf[16]); + p2[1] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[1], buf[17]); + buf[19] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[19], buf[17]); + p2[0] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[0], buf[19]); + buf[18] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[18], buf[19]); + p2[2] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[2], buf[18]); + buf[22] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[22], buf[18]); + p2[0] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[0], buf[22]); + buf[23] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[23], buf[22]); + p2[1] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[1], buf[23]); + buf[21] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[21], buf[23]); + p2[0] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[0], buf[21]); + buf[20] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[20], buf[21]); + p2[3] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[3], buf[20]); + buf[28] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[28], buf[20]); + p2[0] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[0], buf[28]); + buf[29] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[29], buf[28]); + p2[1] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[1], buf[29]); + buf[31] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[31], buf[29]); + p2[0] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[0], buf[31]); + buf[30] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[30], buf[31]); + p2[2] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[2], buf[30]); + buf[26] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[26], buf[30]); + p2[0] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[0], buf[26]); + buf[27] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[27], buf[26]); + p2[1] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[1], buf[27]); + buf[25] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[25], buf[27]); + p2[0] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[0], buf[25]); + buf[24] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[24], buf[25]); + p2[4] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[4], buf[24]); + buf[8] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[8], buf[24]); + p2[0] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[0], buf[8]); + buf[9] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[9], buf[8]); + p2[1] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[1], buf[9]); + buf[11] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[11], buf[9]); + p2[0] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[0], buf[11]); + buf[10] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[10], buf[11]); + p2[2] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[2], buf[10]); + 
buf[14] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[14], buf[10]); + p2[0] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[0], buf[14]); + buf[15] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[15], buf[14]); + p2[1] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[1], buf[15]); + buf[13] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[13], buf[15]); + p2[0] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[0], buf[13]); + buf[12] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[12], buf[13]); + p2[3] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[3], buf[12]); + buf[4] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[4], buf[12]); + p2[0] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[0], buf[4]); + buf[5] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[5], buf[4]); + p2[1] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[1], buf[5]); + buf[7] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[7], buf[5]); + p2[0] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[0], buf[7]); + buf[6] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[6], buf[7]); + p2[2] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[2], buf[6]); + buf[2] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[2], buf[6]); + p2[0] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[0], buf[2]); + buf[3] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[3], buf[2]); + p2[1] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[1], buf[3]); + buf[1] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[1], buf[3]); + + p2[0] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(p2[0], buf[1]); + buf[0] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(buf[0], buf[1]); + + for (j = 0; j < 6; j++) { + pre[j][i + 0] = PQCLEAN_MCELIECE348864_SSE_vec128_extract(p2[j], 0); + pre[j][i + 1] = PQCLEAN_MCELIECE348864_SSE_vec128_extract(p2[j], 1); + } + + out64[0][i + 0] = PQCLEAN_MCELIECE348864_SSE_vec128_extract(buf[0], 0); + out64[0][i + 1] = PQCLEAN_MCELIECE348864_SSE_vec128_extract(buf[0], 1); + } + + // + + for (j = 0; j < GFBITS; j++) { + t[j] = (beta[0] >> j) & 1; + t[j] = -t[j]; + } + + PQCLEAN_MCELIECE348864_SSE_vec_mul(out64[1], pre[0], t); + + for (i = 1; i < 6; i++) { + for (j = 0; j < GFBITS; j++) { + t[j] = (beta[i] >> j) & 1; + t[j] = -t[j]; + } + + PQCLEAN_MCELIECE348864_SSE_vec_mul(t, pre[i], t); + PQCLEAN_MCELIECE348864_SSE_vec_add(out64[1], out64[1], t); + } + + for (b = 0; b < GFBITS; b++) { + out[b] = PQCLEAN_MCELIECE348864_SSE_vec128_set2x(out64[0][b], out64[1][b]); + } +} + +void PQCLEAN_MCELIECE348864_SSE_fft_tr(vec128 out[GFBITS], vec128 in[][ GFBITS ]) { + butterflies_tr(out, in); + radix_conversions_tr(out); +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_sse/fft_tr.h b/src/kem/classic_mceliece/pqclean_mceliece348864_sse/fft_tr.h new file mode 100644 index 0000000000..9244564085 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_sse/fft_tr.h @@ -0,0 +1,13 @@ +#ifndef PQCLEAN_MCELIECE348864_SSE_FFT_TR_H +#define PQCLEAN_MCELIECE348864_SSE_FFT_TR_H +/* + This file is for transpose of the Gao-Mateer FFT +*/ + +#include "params.h" +#include "vec128.h" + +void PQCLEAN_MCELIECE348864_SSE_fft_tr(vec128 /*out*/[GFBITS], vec128 /*in*/[][ GFBITS ]); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_sse/gf.c b/src/kem/classic_mceliece/pqclean_mceliece348864_sse/gf.c new file mode 100644 index 0000000000..fa5f25d6ce --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_sse/gf.c @@ -0,0 +1,169 @@ +/* + This file is for functions for field arithmetic +*/ + +#include "gf.h" + +#include "params.h" + +gf PQCLEAN_MCELIECE348864_SSE_gf_iszero(gf a) { + uint32_t t = a; + + t -= 1; + t >>= 20; + + 
return (gf) t; +} + +gf PQCLEAN_MCELIECE348864_SSE_gf_add(gf in0, gf in1) { + return in0 ^ in1; +} + +gf PQCLEAN_MCELIECE348864_SSE_gf_mul(gf in0, gf in1) { + int i; + + uint32_t tmp; + uint32_t t0; + uint32_t t1; + uint32_t t; + + t0 = in0; + t1 = in1; + + tmp = t0 * (t1 & 1); + + for (i = 1; i < GFBITS; i++) { + tmp ^= (t0 * (t1 & (1 << i))); + } + + t = tmp & 0x7FC000; + tmp ^= t >> 9; + tmp ^= t >> 12; + + t = tmp & 0x3000; + tmp ^= t >> 9; + tmp ^= t >> 12; + + return tmp & ((1 << GFBITS) - 1); +} + +/* input: field element in */ +/* return: in^2 */ +static inline gf gf_sq(gf in) { + const uint32_t B[] = {0x55555555, 0x33333333, 0x0F0F0F0F, 0x00FF00FF}; + + uint32_t x = in; + uint32_t t; + + x = (x | (x << 8)) & B[3]; + x = (x | (x << 4)) & B[2]; + x = (x | (x << 2)) & B[1]; + x = (x | (x << 1)) & B[0]; + + t = x & 0x7FC000; + x ^= t >> 9; + x ^= t >> 12; + + t = x & 0x3000; + x ^= t >> 9; + x ^= t >> 12; + + return x & ((1 << GFBITS) - 1); +} + +gf PQCLEAN_MCELIECE348864_SSE_gf_inv(gf in) { + gf tmp_11; + gf tmp_1111; + + gf out = in; + + out = gf_sq(out); + tmp_11 = PQCLEAN_MCELIECE348864_SSE_gf_mul(out, in); // 11 + + out = gf_sq(tmp_11); + out = gf_sq(out); + tmp_1111 = PQCLEAN_MCELIECE348864_SSE_gf_mul(out, tmp_11); // 1111 + + out = gf_sq(tmp_1111); + out = gf_sq(out); + out = gf_sq(out); + out = gf_sq(out); + out = PQCLEAN_MCELIECE348864_SSE_gf_mul(out, tmp_1111); // 11111111 + + out = gf_sq(out); + out = gf_sq(out); + out = PQCLEAN_MCELIECE348864_SSE_gf_mul(out, tmp_11); // 1111111111 + + out = gf_sq(out); + out = PQCLEAN_MCELIECE348864_SSE_gf_mul(out, in); // 11111111111 + + return gf_sq(out); // 111111111110 +} + +/* input: field element den, num */ +/* return: (num/den) */ +gf PQCLEAN_MCELIECE348864_SSE_gf_frac(gf den, gf num) { + return PQCLEAN_MCELIECE348864_SSE_gf_mul(PQCLEAN_MCELIECE348864_SSE_gf_inv(den), num); +} + +/* input: in0, in1 in GF((2^m)^t)*/ +/* output: out = in0*in1 */ +void PQCLEAN_MCELIECE348864_SSE_GF_mul(gf *out, const gf *in0, const gf *in1) { + int i, j; + + gf prod[ SYS_T * 2 - 1 ]; + + for (i = 0; i < SYS_T * 2 - 1; i++) { + prod[i] = 0; + } + + for (i = 0; i < SYS_T; i++) { + for (j = 0; j < SYS_T; j++) { + prod[i + j] ^= PQCLEAN_MCELIECE348864_SSE_gf_mul(in0[i], in1[j]); + } + } + + // + + for (i = (SYS_T - 1) * 2; i >= SYS_T; i--) { + prod[i - SYS_T + 9] ^= PQCLEAN_MCELIECE348864_SSE_gf_mul(prod[i], (gf) 877); + prod[i - SYS_T + 7] ^= PQCLEAN_MCELIECE348864_SSE_gf_mul(prod[i], (gf) 2888); + prod[i - SYS_T + 5] ^= PQCLEAN_MCELIECE348864_SSE_gf_mul(prod[i], (gf) 1781); + prod[i - SYS_T + 0] ^= PQCLEAN_MCELIECE348864_SSE_gf_mul(prod[i], (gf) 373); + } + + for (i = 0; i < SYS_T; i++) { + out[i] = prod[i]; + } +} + +/* 2 field multiplications */ +uint64_t PQCLEAN_MCELIECE348864_SSE_gf_mul2(gf a, gf b0, gf b1) { + int i; + + uint64_t tmp = 0; + uint64_t t0; + uint64_t t1; + uint64_t t; + uint64_t mask = 0x0000000100000001; + + t0 = a; + t1 = b1; + t1 = (t1 << 32) | b0; + + for (i = 0; i < GFBITS; i++) { + tmp ^= t0 * (t1 & mask); + mask += mask; + } + + // + + t = tmp & 0x007FC000007FC000; + tmp ^= (t >> 9) ^ (t >> 12); + + t = tmp & 0x0000300000003000; + tmp ^= (t >> 9) ^ (t >> 12); + + return tmp & 0x00000FFF00000FFF; +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_sse/gf.h b/src/kem/classic_mceliece/pqclean_mceliece348864_sse/gf.h new file mode 100644 index 0000000000..8b3254d6d0 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_sse/gf.h @@ -0,0 +1,26 @@ +#ifndef PQCLEAN_MCELIECE348864_SSE_GF_H +#define 
PQCLEAN_MCELIECE348864_SSE_GF_H
+/*
+  This file is for functions for field arithmetic
+*/
+
+
+#include "params.h"
+
+#include <stdint.h>
+
+typedef uint16_t gf;
+
+gf PQCLEAN_MCELIECE348864_SSE_gf_iszero(gf /*a*/);
+gf PQCLEAN_MCELIECE348864_SSE_gf_add(gf /*in0*/, gf /*in1*/);
+gf PQCLEAN_MCELIECE348864_SSE_gf_mul(gf /*in0*/, gf /*in1*/);
+gf PQCLEAN_MCELIECE348864_SSE_gf_frac(gf /*den*/, gf /*num*/);
+gf PQCLEAN_MCELIECE348864_SSE_gf_inv(gf /*in*/);
+
+void PQCLEAN_MCELIECE348864_SSE_GF_mul(gf * /*out*/, const gf * /*in0*/, const gf * /*in1*/);
+
+/* 2 field multiplications */
+uint64_t PQCLEAN_MCELIECE348864_SSE_gf_mul2(gf a, gf b0, gf b1);
+
+#endif
+
diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_sse/operations.c b/src/kem/classic_mceliece/pqclean_mceliece348864_sse/operations.c
new file mode 100644
index 0000000000..fe43ab236e
--- /dev/null
+++ b/src/kem/classic_mceliece/pqclean_mceliece348864_sse/operations.c
@@ -0,0 +1,136 @@
+#include "api.h"
+
+#include "aes256ctr.h"
+#include "controlbits.h"
+#include "crypto_hash.h"
+#include "decrypt.h"
+#include "encrypt.h"
+#include "params.h"
+#include "pk_gen.h"
+#include "randombytes.h"
+#include "sk_gen.h"
+#include "util.h"
+
+#include <stdint.h>
+
+#include <string.h>
+
+int PQCLEAN_MCELIECE348864_SSE_crypto_kem_enc(
+    uint8_t *c,
+    uint8_t *key,
+    const uint8_t *pk
+) {
+    uint8_t two_e[ 1 + SYS_N / 8 ] = {2};
+    uint8_t *e = two_e + 1;
+    uint8_t one_ec[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ] = {1};
+
+    PQCLEAN_MCELIECE348864_SSE_encrypt(c, e, pk);
+
+    crypto_hash_32b(c + SYND_BYTES, two_e, sizeof(two_e));
+
+    memcpy(one_ec + 1, e, SYS_N / 8);
+    memcpy(one_ec + 1 + SYS_N / 8, c, SYND_BYTES + 32);
+
+    crypto_hash_32b(key, one_ec, sizeof(one_ec));
+
+    return 0;
+}
+
+int PQCLEAN_MCELIECE348864_SSE_crypto_kem_dec(
+    uint8_t *key,
+    const uint8_t *c,
+    const uint8_t *sk
+) {
+    int i;
+
+    uint8_t ret_confirm = 0;
+    uint8_t ret_decrypt = 0;
+
+    uint16_t m;
+
+    uint8_t conf[32];
+    uint8_t two_e[ 1 + SYS_N / 8 ] = {2};
+    uint8_t *e = two_e + 1;
+    uint8_t preimage[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ];
+    uint8_t *x = preimage;
+
+    //
+
+    ret_decrypt = (uint8_t)PQCLEAN_MCELIECE348864_SSE_decrypt(e, sk + SYS_N / 8, c);
+
+    crypto_hash_32b(conf, two_e, sizeof(two_e));
+
+    for (i = 0; i < 32; i++) {
+        ret_confirm |= conf[i] ^ c[SYND_BYTES + i];
+    }
+
+    m = ret_decrypt | ret_confirm;
+    m -= 1;
+    m >>= 8;
+
+    *x++ = (~m & 0) | (m & 1);
+    for (i = 0; i < SYS_N / 8; i++) {
+        *x++ = (~m & sk[i]) | (m & e[i]);
+    }
+    for (i = 0; i < SYND_BYTES + 32; i++) {
+        *x++ = c[i];
+    }
+
+    crypto_hash_32b(key, preimage, sizeof(preimage));
+
+    return 0;
+}
+
+int PQCLEAN_MCELIECE348864_SSE_crypto_kem_keypair
+(
+    uint8_t *pk,
+    uint8_t *sk
+) {
+    int i;
+    uint8_t seed[ 32 ];
+    uint8_t r[ SYS_T * 2 + (1 << GFBITS)*sizeof(uint32_t) + SYS_N / 8 + 32 ];
+    uint8_t nonce[ 16 ] = {0};
+    uint8_t *rp;
+
+    gf f[ SYS_T ]; // element in GF(2^mt)
+    gf irr[ SYS_T ]; // Goppa polynomial
+    uint32_t perm[ 1 << GFBITS ]; // random permutation
+
+    randombytes(seed, sizeof(seed));
+
+    while (1) {
+        rp = r;
+        PQCLEAN_MCELIECE348864_SSE_aes256ctr(r, sizeof(r), nonce, seed);
+        memcpy(seed, &r[ sizeof(r) - 32 ], 32);
+
+        for (i = 0; i < SYS_T; i++) {
+            f[i] = PQCLEAN_MCELIECE348864_SSE_load2(rp + i * 2);
+        }
+        rp += sizeof(f);
+        if (PQCLEAN_MCELIECE348864_SSE_genpoly_gen(irr, f)) {
+            continue;
+        }
+
+        for (i = 0; i < (1 << GFBITS); i++) {
+            perm[i] = PQCLEAN_MCELIECE348864_SSE_load4(rp + i * 4);
+        }
+        rp += sizeof(perm);
+        if (PQCLEAN_MCELIECE348864_SSE_perm_check(perm)) {
+            continue;
+        }
+
+        for (i = 0; i < SYS_T; i++) {
+            PQCLEAN_MCELIECE348864_SSE_store2(sk + SYS_N / 8 + i * 2, irr[i]);
+        }
+        if (PQCLEAN_MCELIECE348864_SSE_pk_gen(pk, perm, sk + SYS_N / 8)) {
+            continue;
+        }
+
+        memcpy(sk, rp, SYS_N / 8);
+        PQCLEAN_MCELIECE348864_SSE_controlbits(sk + SYS_N / 8 + IRR_BYTES, perm);
+
+        break;
+    }
+
+    return 0;
+}
+
diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_sse/params.h b/src/kem/classic_mceliece/pqclean_mceliece348864_sse/params.h
new file mode 100644
index 0000000000..1b23043c2b
--- /dev/null
+++ b/src/kem/classic_mceliece/pqclean_mceliece348864_sse/params.h
@@ -0,0 +1,21 @@
+#ifndef PQCLEAN_MCELIECE348864_SSE_PARAMS_H
+#define PQCLEAN_MCELIECE348864_SSE_PARAMS_H
+
+#define GFBITS 12
+#define SYS_N 3488
+#define SYS_T 64
+
+#define COND_BYTES ((1 << (GFBITS-4))*(2*GFBITS - 1))
+#define IRR_BYTES (SYS_T * 2)
+
+#define PK_NROWS (SYS_T*GFBITS)
+#define PK_NCOLS (SYS_N - PK_NROWS)
+#define PK_ROW_BYTES ((PK_NCOLS + 7)/8)
+
+#define SK_BYTES (SYS_N/8 + IRR_BYTES + COND_BYTES)
+#define SYND_BYTES ((PK_NROWS + 7)/8)
+
+#define GFMASK ((1 << GFBITS) - 1)
+
+#endif
+
diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_sse/pk_gen.c b/src/kem/classic_mceliece/pqclean_mceliece348864_sse/pk_gen.c
new file mode 100644
index 0000000000..0d0097c337
--- /dev/null
+++ b/src/kem/classic_mceliece/pqclean_mceliece348864_sse/pk_gen.c
@@ -0,0 +1,329 @@
+/*
+  This file is for public-key generation
+*/
+
+#include "pk_gen.h"
+
+#include "benes.h"
+#include "controlbits.h"
+#include "fft.h"
+#include "params.h"
+#include "transpose.h"
+#include "util.h"
+
+#include <stdint.h>
+
+#include <immintrin.h>
+
+#define min(a, b) (((a) < (b)) ? (a) : (b))
+
+static void de_bitslicing(uint64_t *out, vec128 in[][GFBITS]) {
+    int i, j, r;
+    uint64_t u = 0;
+
+    for (i = 0; i < (1 << GFBITS); i++) {
+        out[i] = 0 ;
+    }
+
+    for (i = 0; i < 32; i++) {
+        for (j = GFBITS - 1; j >= 0; j--) {
+            u = PQCLEAN_MCELIECE348864_SSE_vec128_extract(in[i][j], 0);
+            for (r = 0; r < 64; r++) {
+                out[i * 128 + 0 * 64 + r] <<= 1;
+                out[i * 128 + 0 * 64 + r] |= (u >> r) & 1;
+            }
+            u = PQCLEAN_MCELIECE348864_SSE_vec128_extract(in[i][j], 1);
+            for (r = 0; r < 64; r++) {
+                out[i * 128 + 1 * 64 + r] <<= 1;
+                out[i * 128 + 1 * 64 + r] |= (u >> r) & 1;
+            }
+        }
+    }
+}
+
+static void to_bitslicing_2x(vec128 out0[][GFBITS], vec128 out1[][GFBITS], const uint64_t *in) {
+    int i, j, k, r;
+    uint64_t u[2] = {0};
+
+    for (i = 0; i < 32; i++) {
+        for (j = GFBITS - 1; j >= 0; j--) {
+            for (k = 0; k < 2; k++) {
+                for (r = 63; r >= 0; r--) {
+                    u[k] <<= 1;
+                    u[k] |= (in[i * 128 + k * 64 + r] >> (j + GFBITS)) & 1;
+                }
+            }
+
+            out1[i][j] = PQCLEAN_MCELIECE348864_SSE_vec128_set2x(u[0], u[1]);
+        }
+
+        for (j = GFBITS - 1; j >= 0; j--) {
+            for (k = 0; k < 2; k++) {
+                for (r = 63; r >= 0; r--) {
+                    u[k] <<= 1;
+                    u[k] |= (in[i * 128 + k * 64 + r] >> j) & 1;
+                }
+            }
+
+            out0[i][GFBITS - 1 - j] = PQCLEAN_MCELIECE348864_SSE_vec128_set2x(u[0], u[1]);
+        }
+    }
+}
+
+/* return number of trailing zeros of the non-zero input in */
+static inline int ctz(uint64_t in) {
+    return (int)_tzcnt_u64(in);
+}
+
+static inline uint64_t same_mask(uint16_t x, uint16_t y) {
+    uint64_t mask;
+
+    mask = x ^ y;
+    mask -= 1;
+    mask >>= 63;
+    mask = -mask;
+
+    return mask;
+}
+
+static int mov_columns(uint64_t mat[][ ((SYS_N + 127) / 128) * 2 ], uint32_t *perm) {
+    int i, j, k, s, block_idx, row;
+    uint64_t buf[64], ctz_list[32], t, d, mask;
+
+    row = GFBITS * SYS_T - 32;
+    block_idx = row / 64;
+
+    // extract the 32x64 matrix
+
+    for (i = 0; i < 32; i++) {
+        buf[i] = (mat[ row + i ][ block_idx + 0 ] >> 32)
| + (mat[ row + i ][ block_idx + 1 ] << 32); + } + + // compute the column indices of pivots by Gaussian elimination. + // the indices are stored in ctz_list + + for (i = 0; i < 32; i++) { + t = buf[i]; + for (j = i + 1; j < 32; j++) { + t |= buf[j]; + } + + if (t == 0) { + return -1; // return if buf is not full rank + } + + ctz_list[i] = s = ctz(t); + + for (j = i + 1; j < 32; j++) { + mask = (buf[i] >> s) & 1; + mask -= 1; + buf[i] ^= buf[j] & mask; + } + for (j = 0; j < i; j++) { + mask = (buf[j] >> s) & 1; + mask = -mask; + buf[j] ^= buf[i] & mask; + } + for (j = i + 1; j < 32; j++) { + mask = (buf[j] >> s) & 1; + mask = -mask; + buf[j] ^= buf[i] & mask; + } + } + + // updating permutation + + for (j = 0; j < 32; j++) { + for (k = j + 1; k < 64; k++) { + d = perm[ row + j ] ^ perm[ row + k ]; + d &= same_mask(k, ctz_list[j]); + perm[ row + j ] ^= d; + perm[ row + k ] ^= d; + } + } + + // moving columns of mat according to the column indices of pivots + + for (i = 0; i < GFBITS * SYS_T; i += 64) { + + for (j = 0; j < min(64, GFBITS * SYS_T - i); j++) { + buf[j] = (mat[ i + j ][ block_idx + 0 ] >> 32) | + (mat[ i + j ][ block_idx + 1 ] << 32); + } + + PQCLEAN_MCELIECE348864_SSE_transpose_64x64(buf); + + for (j = 0; j < 32; j++) { + for (k = j + 1; k < 64; k++) { + d = buf[ j ] ^ buf[ k ]; + d &= same_mask(k, ctz_list[j]); + buf[ j ] ^= d; + buf[ k ] ^= d; + } + } + + PQCLEAN_MCELIECE348864_SSE_transpose_64x64(buf); + + for (j = 0; j < min(64, GFBITS * SYS_T - i); j++) { + mat[ i + j ][ block_idx + 0 ] = (mat[ i + j ][ block_idx + 0 ] << 32 >> 32) | (buf[j] << 32); + mat[ i + j ][ block_idx + 1 ] = (mat[ i + j ][ block_idx + 1 ] >> 32 << 32) | (buf[j] >> 32); + } + } + + return 0; +} + +#define NBLOCKS1_H ((SYS_N + 63) / 64) +#define NBLOCKS2_H ((SYS_N + 127) / 128) +#define NBLOCKS_I ((GFBITS * SYS_T + 63) / 64) + +int PQCLEAN_MCELIECE348864_SSE_pk_gen(unsigned char *pk, uint32_t *perm, const unsigned char *sk) { + int i, j, k; + int row, c; + + uint64_t mat[ GFBITS * SYS_T ][ NBLOCKS2_H * 2 ]; + uint64_t ops[ GFBITS * SYS_T ][ NBLOCKS_I ]; + + uint64_t mask; + + uint64_t irr_int[ GFBITS ]; + + vec128 consts[32][ GFBITS ]; + vec128 eval[ 32 ][ GFBITS ]; + vec128 prod[ 32 ][ GFBITS ]; + vec128 tmp[ GFBITS ]; + + uint64_t list[1 << GFBITS]; + + // compute the inverses + + PQCLEAN_MCELIECE348864_SSE_irr_load(irr_int, sk); + + PQCLEAN_MCELIECE348864_SSE_fft(eval, irr_int); + + PQCLEAN_MCELIECE348864_SSE_vec128_copy(prod[0], eval[0]); + + for (i = 1; i < 32; i++) { + PQCLEAN_MCELIECE348864_SSE_vec128_mul(prod[i], prod[i - 1], eval[i]); + } + + PQCLEAN_MCELIECE348864_SSE_vec128_inv(tmp, prod[31]); + + for (i = 30; i >= 0; i--) { + PQCLEAN_MCELIECE348864_SSE_vec128_mul(prod[i + 1], prod[i], tmp); + PQCLEAN_MCELIECE348864_SSE_vec128_mul(tmp, tmp, eval[i + 1]); + } + + PQCLEAN_MCELIECE348864_SSE_vec128_copy(prod[0], tmp); + + // fill matrix + + de_bitslicing(list, prod); + + for (i = 0; i < (1 << GFBITS); i++) { + list[i] <<= GFBITS; + list[i] |= i; + list[i] |= ((uint64_t) perm[i]) << 31; + } + + PQCLEAN_MCELIECE348864_SSE_sort_63b(1 << GFBITS, list); + + to_bitslicing_2x(consts, prod, list); + + for (i = 0; i < (1 << GFBITS); i++) { + perm[i] = list[i] & GFMASK; + } + + for (j = 0; j < NBLOCKS2_H; j++) { + for (k = 0; k < GFBITS; k++) { + mat[ k ][ 2 * j + 0 ] = PQCLEAN_MCELIECE348864_SSE_vec128_extract(prod[ j ][ k ], 0); + mat[ k ][ 2 * j + 1 ] = PQCLEAN_MCELIECE348864_SSE_vec128_extract(prod[ j ][ k ], 1); + } + } + + for (i = 1; i < SYS_T; i++) { + for (j = 0; j < NBLOCKS2_H; j++) { + 
PQCLEAN_MCELIECE348864_SSE_vec128_mul(prod[j], prod[j], consts[j]); + + for (k = 0; k < GFBITS; k++) { + mat[ i * GFBITS + k ][ 2 * j + 0 ] = PQCLEAN_MCELIECE348864_SSE_vec128_extract(prod[ j ][ k ], 0); + mat[ i * GFBITS + k ][ 2 * j + 1 ] = PQCLEAN_MCELIECE348864_SSE_vec128_extract(prod[ j ][ k ], 1); + } + } + } + + // gaussian elimination + + for (i = 0; i < PK_NROWS; i++) { + for (j = 0; j < NBLOCKS_I; j++) { + ops[ i ][ j ] = 0; + } + } + + for (i = 0; i < PK_NROWS; i++) { + ops[ i ][ i / 64 ] = 1; + ops[ i ][ i / 64 ] <<= (i % 64); + } + + for (row = 0; row < PK_NROWS; row++) { + i = row >> 6; + j = row & 63; + + if (row == GFBITS * SYS_T - 32) { + if (mov_columns(mat, perm)) { + return -1; + } + } + + for (k = row + 1; k < PK_NROWS; k++) { + mask = mat[ row ][ i ] >> j; + mask &= 1; + mask -= 1; + + for (c = 0; c < NBLOCKS1_H; c++) { + mat[ row ][ c ] ^= mat[ k ][ c ] & mask; + } + } + + if ( ((mat[ row ][ i ] >> j) & 1) == 0 ) { // return if not systematic + return -1; + } + + for (k = 0; k < row; k++) { + mask = mat[ k ][ i ] >> j; + mask &= 1; + mask = -mask; + + for (c = 0; c < NBLOCKS1_H; c++) { + mat[ k ][ c ] ^= mat[ row ][ c ] & mask; + } + } + + for (k = row + 1; k < PK_NROWS; k++) { + mask = mat[ k ][ i ] >> j; + mask &= 1; + mask = -mask; + + for (c = 0; c < NBLOCKS1_H; c++) { + mat[ k ][ c ] ^= mat[ row ][ c ] & mask; + } + } + } + + for (i = 0; i < GFBITS * SYS_T; i++) { + for (j = NBLOCKS_I; j < NBLOCKS1_H - 1; j++) { + PQCLEAN_MCELIECE348864_SSE_store8(pk, mat[i][j]); + pk += 8; + } + + PQCLEAN_MCELIECE348864_SSE_store_i(pk, mat[i][j], PK_ROW_BYTES % 8); + + pk += PK_ROW_BYTES % 8; + } + + // + + return 0; +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_sse/pk_gen.h b/src/kem/classic_mceliece/pqclean_mceliece348864_sse/pk_gen.h new file mode 100644 index 0000000000..e54b9e6f61 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_sse/pk_gen.h @@ -0,0 +1,13 @@ +#ifndef PQCLEAN_MCELIECE348864_SSE_PK_GEN_H +#define PQCLEAN_MCELIECE348864_SSE_PK_GEN_H +/* + This file is for public-key generation +*/ + + +#include "gf.h" + +int PQCLEAN_MCELIECE348864_SSE_pk_gen(unsigned char * /*pk*/, uint32_t * /*perm*/, const unsigned char * /*sk*/); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_sse/powers.inc b/src/kem/classic_mceliece/pqclean_mceliece348864_sse/powers.inc new file mode 100644 index 0000000000..8e15bd373e --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_sse/powers.inc @@ -0,0 +1,448 @@ +{ +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ 
+PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), 
+PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x5555555555555555, 0x5555555555555555), +}, +{ +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x5555555555555555, 0x5555555555555555), +}, +{ +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x5555555555555555, 0x5555555555555555), +}, +{ +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x5555555555555555, 0x5555555555555555), +}, +{ +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), 
+PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), 
+PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x5555555555555555, 0x5555555555555555), +}, +{ +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x5555555555555555, 0x5555555555555555), +}, +{ +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x5555555555555555, 0x5555555555555555), +}, +{ +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F), 
+PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x5555555555555555, 0x5555555555555555), +}, +{ +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC), 
+PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x5555555555555555, 0x5555555555555555), +}, +{ +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x5555555555555555, 0x5555555555555555), +}, +{ +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), 
+PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x5555555555555555, 0x5555555555555555), +}, +{ +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x5555555555555555, 0x5555555555555555), +}, +{ +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), 
+PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x5555555555555555, 0x5555555555555555), +}, +{ +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), 
+PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x5555555555555555, 0x5555555555555555), +}, +{ +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x5555555555555555, 0x5555555555555555), +}, +{ +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333), +PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x5555555555555555, 0x5555555555555555), +}, diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_sse/scalars.inc b/src/kem/classic_mceliece/pqclean_mceliece348864_sse/scalars.inc new file mode 100644 index 0000000000..aa8f64b951 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_sse/scalars.inc @@ -0,0 +1,70 @@ +{ + 0XF3CFC030FC30F003, + 0X3FCF0F003C00C00C, + 0X30033CC300C0C03C, + 0XCCFF0F3C0F30F0C0, + 0X0300C03FF303C3F0, + 0X3FFF3C0FF0CCCCC0, + 0XF3FFF0C00F3C3CC0, + 0X3003333FFFC3C000, + 0X0FF30FFFC3FFF300, + 0XFFC0F300F0F0CC00, + 0XC0CFF3FCCC3CFC00, + 0XFC3C03F0F330C000, +}, +{ + 0X000F00000000F00F, + 0X00000F00F00000F0, + 0X0F00000F00000F00, + 0XF00F00F00F000000, + 0X00F00000000000F0, + 0X0000000F00000000, + 0XF00000000F00F000, + 0X00F00F00000F0000, + 0X0000F00000F00F00, + 0X000F00F00F00F000, + 0X00F00F0000000000, + 0X0000000000F00000, +}, +{ + 0X0000FF00FF0000FF, + 0X0000FF000000FF00, + 0XFF0000FF00FF0000, + 0XFFFF0000FF000000, + 0X00FF00FF00FF0000, + 0X0000FFFFFF000000, + 0X00FFFF00FF000000, + 0XFFFFFF0000FF0000, + 0XFFFF00FFFF00FF00, + 0X0000FF0000000000, + 0XFFFFFF00FF000000, + 0X00FF000000000000, +}, +{ + 0X000000000000FFFF, + 0X00000000FFFF0000, + 0X0000000000000000, + 0XFFFF000000000000, + 0X00000000FFFF0000, + 
0X0000FFFF00000000, + 0X0000000000000000, + 0X00000000FFFF0000, + 0X0000FFFF00000000, + 0X0000000000000000, + 0X0000000000000000, + 0X0000000000000000, +}, +{ + 0X00000000FFFFFFFF, + 0XFFFFFFFF00000000, + 0XFFFFFFFF00000000, + 0X0000000000000000, + 0X0000000000000000, + 0XFFFFFFFF00000000, + 0X0000000000000000, + 0X0000000000000000, + 0XFFFFFFFF00000000, + 0X0000000000000000, + 0X0000000000000000, + 0X0000000000000000, +} diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_sse/scalars_2x.inc b/src/kem/classic_mceliece/pqclean_mceliece348864_sse/scalars_2x.inc new file mode 100644 index 0000000000..8eb780322f --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_sse/scalars_2x.inc @@ -0,0 +1,70 @@ +{ + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xf3cfc030fc30f003, 0x000c03c0c3c0330c), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x3fcf0f003c00c00c, 0xf330cffcc00f33c0), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x30033cc300c0c03c, 0xccf330f00f3c0333), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xccff0f3c0f30f0c0, 0xff03fff3ff0cf0c0), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0300c03ff303c3f0, 0x3cc3fcf00fcc303c), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x3fff3c0ff0ccccc0, 0x0f000c0fc30303f3), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xf3fff0c00f3c3cc0, 0xcf0fc3ff333ccf3c), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x3003333fffc3c000, 0x003f3fc3c0ff333f), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0ff30fffc3fff300, 0x3cc3f0f3cf0ff00f), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xffc0f300f0f0cc00, 0xf3f33cc03fc30cc0), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xc0cff3fccc3cfc00, 0x3cc330cfc333f33f), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xfc3c03f0f330c000, 0x3cc0303ff3c3fffc), +}, +{ + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x000f00000000f00f, 0x0f00f00f00000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x00000f00f00000f0, 0xf00000000000f000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0f00000f00000f00, 0x00000f00000000f0), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xf00f00f00f000000, 0x0f00f00000f00000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x00f00000000000f0, 0x000f00000f00f00f), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000f00000000, 0x00f00f00f00f0000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xf00000000f00f000, 0x0f00f00000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x00f00f00000f0000, 0x000000000f000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000f00000f00f00, 0x00f00000000f00f0), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x000f00f00f00f000, 0x0000f00f00000f00), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x00f00f0000000000, 0xf00000f00000f00f), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000f00000, 0x00000f00f00f00f0), +}, +{ + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000ff00ff0000ff, 0xff00ffffff000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000ff000000ff00, 0xff0000ffff000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xff0000ff00ff0000, 0xffff00ffff000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xffff0000ff000000, 0xff00ffffffffff00), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x00ff00ff00ff0000, 0x00000000ff00ff00), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000ffffff000000, 0xffffffff00ff0000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x00ffff00ff000000, 0x00ffffff00ff0000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xffffff0000ff0000, 0xffff00ffff00ffff), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xffff00ffff00ff00, 0xffff0000ffffffff), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000ff0000000000, 
0xff00000000ff0000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xffffff00ff000000, 0x000000ff00ff00ff), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x00ff000000000000, 0x00ff00ff00ffff00), +}, +{ + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x000000000000ffff, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x00000000ffff0000, 0xffff000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xffff000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x00000000ffff0000, 0xffff00000000ffff), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000ffff00000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000ffff00000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x00000000ffff0000, 0xffff00000000ffff), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000ffff00000000, 0x00000000ffff0000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0xffff00000000ffff), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x00000000ffff0000), +}, +{ + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x00000000ffffffff, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xffffffff00000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xffffffff00000000, 0x00000000ffffffff), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0xffffffff00000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0xffffffff00000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xffffffff00000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0xffffffff00000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0xffffffffffffffff), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0xffffffff00000000, 0xffffffff00000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0xffffffffffffffff), + PQCLEAN_MCELIECE348864_SSE_vec128_set2x(0x0000000000000000, 0xffffffff00000000), +}, diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_sse/sk_gen.c b/src/kem/classic_mceliece/pqclean_mceliece348864_sse/sk_gen.c new file mode 100644 index 0000000000..255182fbee --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_sse/sk_gen.c @@ -0,0 +1,98 @@ +/* + This file is for secret-key generation +*/ + +#include "sk_gen.h" + +#include "controlbits.h" +#include "gf.h" +#include "params.h" +#include "util.h" + +/* input: f, element in GF((2^m)^t) */ +/* output: out, minimal polynomial of f */ +/* return: 0 for success and -1 for failure */ +int PQCLEAN_MCELIECE348864_SSE_genpoly_gen(gf *out, gf *f) { + int i, j, k, c; + + gf mat[ SYS_T + 1 ][ SYS_T ]; + gf mask, inv, t; + + // fill matrix + + mat[0][0] = 1; + + for (i = 1; i < SYS_T; i++) { + mat[0][i] = 0; + } + + for (i = 0; i < SYS_T; i++) { + mat[1][i] = f[i]; + } + + for (j = 2; j <= SYS_T; j++) { + PQCLEAN_MCELIECE348864_SSE_GF_mul(mat[j], mat[j - 1], f); + } + + // gaussian + + for (j = 0; j < SYS_T; j++) { + for (k = j + 1; k < SYS_T; k++) { + mask = PQCLEAN_MCELIECE348864_SSE_gf_iszero(mat[ j ][ j ]); + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ j ] ^= mat[ c ][ k ] & mask; + } + + } + + if ( mat[ j ][ j ] == 0 ) { // return if not systematic + return -1; + } + + inv = PQCLEAN_MCELIECE348864_SSE_gf_inv(mat[j][j]); + + for (c = j; c < SYS_T + 
1; c++) { + mat[ c ][ j ] = PQCLEAN_MCELIECE348864_SSE_gf_mul(mat[ c ][ j ], inv) ; + } + + for (k = 0; k < SYS_T; k++) { + if (k != j) { + t = mat[ j ][ k ]; + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ k ] ^= PQCLEAN_MCELIECE348864_SSE_gf_mul(mat[ c ][ j ], t); + } + } + } + } + + for (i = 0; i < SYS_T; i++) { + out[i] = mat[ SYS_T ][ i ]; + } + + return 0; +} + +/* input: permutation p represented as a list of 32-bit intergers */ +/* output: -1 if some interger repeats in p */ +/* 0 otherwise */ +int PQCLEAN_MCELIECE348864_SSE_perm_check(const uint32_t *p) { + int i; + uint64_t list[1 << GFBITS]; + + for (i = 0; i < (1 << GFBITS); i++) { + list[i] = p[i]; + } + + PQCLEAN_MCELIECE348864_SSE_sort_63b(1 << GFBITS, list); + + for (i = 1; i < (1 << GFBITS); i++) { + if (list[i - 1] == list[i]) { + return -1; + } + } + + return 0; +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_sse/sk_gen.h b/src/kem/classic_mceliece/pqclean_mceliece348864_sse/sk_gen.h new file mode 100644 index 0000000000..ca96519e47 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_sse/sk_gen.h @@ -0,0 +1,16 @@ +#ifndef PQCLEAN_MCELIECE348864_SSE_SK_GEN_H +#define PQCLEAN_MCELIECE348864_SSE_SK_GEN_H +/* + This file is for secret-key generation +*/ + + +#include "gf.h" + +#include + +int PQCLEAN_MCELIECE348864_SSE_genpoly_gen(gf * /*out*/, gf * /*f*/); +int PQCLEAN_MCELIECE348864_SSE_perm_check(const uint32_t * /*p*/); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_sse/syndrome_asm.S b/src/kem/classic_mceliece/pqclean_mceliece348864_sse/syndrome_asm.S new file mode 100644 index 0000000000..66c9efecc8 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_sse/syndrome_asm.S @@ -0,0 +1,740 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: int64 b64 + +# qhasm: int64 synd + +# qhasm: int64 addr + +# qhasm: int64 c + +# qhasm: int64 c_all + +# qhasm: int64 row + +# qhasm: int64 p + +# qhasm: int64 e + +# qhasm: int64 s + +# qhasm: reg128 pp + +# qhasm: reg128 ee + +# qhasm: reg128 ss + +# qhasm: int64 buf_ptr + +# qhasm: stack128 buf + +# qhasm: enter syndrome_asm +.p2align 5 +.global _PQCLEAN_MCELIECE348864_SSE_syndrome_asm +.global PQCLEAN_MCELIECE348864_SSE_syndrome_asm +_PQCLEAN_MCELIECE348864_SSE_syndrome_asm: +PQCLEAN_MCELIECE348864_SSE_syndrome_asm: +mov %rsp,%r11 +and $31,%r11 +add $32,%r11 +sub %r11,%rsp + +# qhasm: input_1 += 260780 +# asm 1: add $260780,buf_ptr=int64#4 +# asm 2: leaq buf_ptr=%rcx +leaq 0(%rsp),%rcx + +# qhasm: row = 768 +# asm 1: mov $768,>row=int64#5 +# asm 2: mov $768,>row=%r8 +mov $768,%r8 + +# qhasm: loop: +._loop: + +# qhasm: row -= 1 +# asm 1: sub $1,ss=reg128#1 +# asm 2: movdqu 0(ss=%xmm0 +movdqu 0(%rsi),%xmm0 + +# qhasm: ee = mem128[ input_2 + 96 ] +# asm 1: movdqu 96(ee=reg128#2 +# asm 2: movdqu 96(ee=%xmm1 +movdqu 96(%rdx),%xmm1 + +# qhasm: ss &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 16(pp=%xmm1 +movdqu 16(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 112 ] +# asm 1: movdqu 112(ee=reg128#3 +# asm 2: movdqu 112(ee=%xmm2 +movdqu 112(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 32(pp=%xmm1 +movdqu 
32(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 128 ] +# asm 1: movdqu 128(ee=reg128#3 +# asm 2: movdqu 128(ee=%xmm2 +movdqu 128(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 48(pp=%xmm1 +movdqu 48(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 144 ] +# asm 1: movdqu 144(ee=reg128#3 +# asm 2: movdqu 144(ee=%xmm2 +movdqu 144(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 64(pp=%xmm1 +movdqu 64(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 160 ] +# asm 1: movdqu 160(ee=reg128#3 +# asm 2: movdqu 160(ee=%xmm2 +movdqu 160(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 80(pp=%xmm1 +movdqu 80(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 176 ] +# asm 1: movdqu 176(ee=reg128#3 +# asm 2: movdqu 176(ee=%xmm2 +movdqu 176(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 96(pp=%xmm1 +movdqu 96(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 192 ] +# asm 1: movdqu 192(ee=reg128#3 +# asm 2: movdqu 192(ee=%xmm2 +movdqu 192(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 112(pp=%xmm1 +movdqu 112(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 208 ] +# asm 1: movdqu 208(ee=reg128#3 +# asm 2: movdqu 208(ee=%xmm2 +movdqu 208(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 128(pp=%xmm1 +movdqu 128(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 224 ] +# asm 1: movdqu 224(ee=reg128#3 +# asm 2: movdqu 224(ee=%xmm2 +movdqu 224(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 144(pp=%xmm1 +movdqu 144(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 240 ] +# asm 1: movdqu 240(ee=reg128#3 +# asm 2: movdqu 240(ee=%xmm2 +movdqu 240(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 160(pp=%xmm1 +movdqu 160(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 256 ] +# asm 1: movdqu 256(ee=reg128#3 +# asm 2: movdqu 256(ee=%xmm2 +movdqu 256(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 176(pp=%xmm1 +movdqu 176(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 272 ] +# asm 1: movdqu 272(ee=reg128#3 +# asm 2: movdqu 272(ee=%xmm2 +movdqu 272(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 192(pp=%xmm1 +movdqu 192(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 288 ] +# asm 1: movdqu 288(ee=reg128#3 +# asm 2: movdqu 288(ee=%xmm2 +movdqu 288(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 208(pp=%xmm1 +movdqu 208(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 304 ] +# asm 1: movdqu 304(ee=reg128#3 +# asm 2: movdqu 304(ee=%xmm2 +movdqu 304(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 224(pp=%xmm1 +movdqu 224(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 320 ] +# asm 1: movdqu 320(ee=reg128#3 +# asm 2: movdqu 320(ee=%xmm2 +movdqu 320(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 240(pp=%xmm1 +movdqu 240(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 336 ] +# asm 1: movdqu 336(ee=reg128#3 +# asm 2: movdqu 336(ee=%xmm2 +movdqu 336(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 256(pp=%xmm1 +movdqu 256(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 352 ] +# asm 1: movdqu 352(ee=reg128#3 +# asm 2: movdqu 352(ee=%xmm2 +movdqu 352(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 272(pp=%xmm1 +movdqu 272(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 368 ] +# asm 1: movdqu 368(ee=reg128#3 +# asm 2: movdqu 
368(ee=%xmm2 +movdqu 368(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 288(pp=%xmm1 +movdqu 288(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 384 ] +# asm 1: movdqu 384(ee=reg128#3 +# asm 2: movdqu 384(ee=%xmm2 +movdqu 384(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 304(pp=%xmm1 +movdqu 304(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 400 ] +# asm 1: movdqu 400(ee=reg128#3 +# asm 2: movdqu 400(ee=%xmm2 +movdqu 400(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand pp=reg128#2 +# asm 2: movdqu 320(pp=%xmm1 +movdqu 320(%rsi),%xmm1 + +# qhasm: ee = mem128[ input_2 + 416 ] +# asm 1: movdqu 416(ee=reg128#3 +# asm 2: movdqu 416(ee=%xmm2 +movdqu 416(%rdx),%xmm2 + +# qhasm: pp &= ee +# asm 1: pand buf=stack128#1 +# asm 2: movdqa buf=0(%rsp) +movdqa %xmm0,0(%rsp) + +# qhasm: s = *(uint32 *)(input_1 + 336) +# asm 1: movl 336(s=int64#6d +# asm 2: movl 336(s=%r9d +movl 336(%rsi),%r9d + +# qhasm: e = *(uint32 *)(input_2 + 432) +# asm 1: movl 432(e=int64#7d +# asm 2: movl 432(e=%eax +movl 432(%rdx),%eax + +# qhasm: s &= e +# asm 1: and c_all=int64#6 +# asm 2: popcnt c_all=%r9 +popcnt %r9, %r9 + +# qhasm: b64 = mem64[ buf_ptr + 0 ] +# asm 1: movq 0(b64=int64#7 +# asm 2: movq 0(b64=%rax +movq 0(%rcx),%rax + +# qhasm: c = count(b64) +# asm 1: popcnt c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# qhasm: c_all ^= c +# asm 1: xor b64=int64#7 +# asm 2: movq 8(b64=%rax +movq 8(%rcx),%rax + +# qhasm: c = count(b64) +# asm 1: popcnt c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# qhasm: c_all ^= c +# asm 1: xor addr=int64#7 +# asm 2: mov addr=%rax +mov %r8,%rax + +# qhasm: (uint64) addr >>= 3 +# asm 1: shr $3,synd=int64#8 +# asm 2: movzbq 0(synd=%r10 +movzbq 0(%rax),%r10 + +# qhasm: synd <<= 1 +# asm 1: shl $1,ss=reg128#1 +# asm 2: movdqu 0(ss=%xmm0 +movdqu 0(%rdi),%xmm0 + +# qhasm: ee = mem128[ input_2 + 0 ] +# asm 1: movdqu 0(ee=reg128#2 +# asm 2: movdqu 0(ee=%xmm1 +movdqu 0(%rdx),%xmm1 + +# qhasm: ss ^= ee +# asm 1: pxor ss=reg128#1 +# asm 2: movdqu 16(ss=%xmm0 +movdqu 16(%rdi),%xmm0 + +# qhasm: ee = mem128[ input_2 + 16 ] +# asm 1: movdqu 16(ee=reg128#2 +# asm 2: movdqu 16(ee=%xmm1 +movdqu 16(%rdx),%xmm1 + +# qhasm: ss ^= ee +# asm 1: pxor ss=reg128#1 +# asm 2: movdqu 32(ss=%xmm0 +movdqu 32(%rdi),%xmm0 + +# qhasm: ee = mem128[ input_2 + 32 ] +# asm 1: movdqu 32(ee=reg128#2 +# asm 2: movdqu 32(ee=%xmm1 +movdqu 32(%rdx),%xmm1 + +# qhasm: ss ^= ee +# asm 1: pxor ss=reg128#1 +# asm 2: movdqu 48(ss=%xmm0 +movdqu 48(%rdi),%xmm0 + +# qhasm: ee = mem128[ input_2 + 48 ] +# asm 1: movdqu 48(ee=reg128#2 +# asm 2: movdqu 48(ee=%xmm1 +movdqu 48(%rdx),%xmm1 + +# qhasm: ss ^= ee +# asm 1: pxor ss=reg128#1 +# asm 2: movdqu 64(ss=%xmm0 +movdqu 64(%rdi),%xmm0 + +# qhasm: ee = mem128[ input_2 + 64 ] +# asm 1: movdqu 64(ee=reg128#2 +# asm 2: movdqu 64(ee=%xmm1 +movdqu 64(%rdx),%xmm1 + +# qhasm: ss ^= ee +# asm 1: pxor ss=reg128#1 +# asm 2: movdqu 80(ss=%xmm0 +movdqu 80(%rdi),%xmm0 + +# qhasm: ee = mem128[ input_2 + 80 ] +# asm 1: movdqu 80(ee=reg128#2 +# asm 2: movdqu 80(ee=%xmm1 +movdqu 80(%rdx),%xmm1 + +# qhasm: ss ^= ee +# asm 1: pxor + +void PQCLEAN_MCELIECE348864_SSE_transpose_64x64(uint64_t *in); +void PQCLEAN_MCELIECE348864_SSE_transpose_64x128_sp(vec128 *in); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_sse/transpose_64x128_sp_asm.S b/src/kem/classic_mceliece/pqclean_mceliece348864_sse/transpose_64x128_sp_asm.S new file mode 100644 index 0000000000..b3aae49097 --- /dev/null +++ 
b/src/kem/classic_mceliece/pqclean_mceliece348864_sse/transpose_64x128_sp_asm.S @@ -0,0 +1,8145 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: reg128 x0 + +# qhasm: reg128 x1 + +# qhasm: reg128 x2 + +# qhasm: reg128 x3 + +# qhasm: reg128 x4 + +# qhasm: reg128 x5 + +# qhasm: reg128 x6 + +# qhasm: reg128 x7 + +# qhasm: reg128 t0 + +# qhasm: reg128 t1 + +# qhasm: reg128 v00 + +# qhasm: reg128 v01 + +# qhasm: reg128 v10 + +# qhasm: reg128 v11 + +# qhasm: reg128 mask0 + +# qhasm: reg128 mask1 + +# qhasm: reg128 mask2 + +# qhasm: reg128 mask3 + +# qhasm: reg128 mask4 + +# qhasm: reg128 mask5 + +# qhasm: enter transpose_64x128_sp_asm +.p2align 5 +.global _PQCLEAN_MCELIECE348864_SSE_transpose_64x128_sp_asm +.global PQCLEAN_MCELIECE348864_SSE_transpose_64x128_sp_asm +_PQCLEAN_MCELIECE348864_SSE_transpose_64x128_sp_asm: +PQCLEAN_MCELIECE348864_SSE_transpose_64x128_sp_asm: +mov %rsp,%r11 +and $31,%r11 +add $0,%r11 +sub %r11,%rsp + +# qhasm: mask0 aligned= mem128[ PQCLEAN_MCELIECE348864_SSE_MASK5_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE348864_SSE_MASK5_0(%rip),>mask0=reg128#1 +# asm 2: movdqa PQCLEAN_MCELIECE348864_SSE_MASK5_0(%rip),>mask0=%xmm0 +movdqa PQCLEAN_MCELIECE348864_SSE_MASK5_0(%rip),%xmm0 + +# qhasm: mask1 aligned= mem128[ PQCLEAN_MCELIECE348864_SSE_MASK5_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE348864_SSE_MASK5_1(%rip),>mask1=reg128#2 +# asm 2: movdqa PQCLEAN_MCELIECE348864_SSE_MASK5_1(%rip),>mask1=%xmm1 +movdqa PQCLEAN_MCELIECE348864_SSE_MASK5_1(%rip),%xmm1 + +# qhasm: mask2 aligned= mem128[ PQCLEAN_MCELIECE348864_SSE_MASK4_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE348864_SSE_MASK4_0(%rip),>mask2=reg128#3 +# asm 2: movdqa PQCLEAN_MCELIECE348864_SSE_MASK4_0(%rip),>mask2=%xmm2 +movdqa PQCLEAN_MCELIECE348864_SSE_MASK4_0(%rip),%xmm2 + +# qhasm: mask3 aligned= mem128[ PQCLEAN_MCELIECE348864_SSE_MASK4_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE348864_SSE_MASK4_1(%rip),>mask3=reg128#4 +# asm 2: movdqa PQCLEAN_MCELIECE348864_SSE_MASK4_1(%rip),>mask3=%xmm3 +movdqa PQCLEAN_MCELIECE348864_SSE_MASK4_1(%rip),%xmm3 + +# qhasm: mask4 aligned= mem128[ PQCLEAN_MCELIECE348864_SSE_MASK3_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE348864_SSE_MASK3_0(%rip),>mask4=reg128#5 +# asm 2: movdqa PQCLEAN_MCELIECE348864_SSE_MASK3_0(%rip),>mask4=%xmm4 +movdqa PQCLEAN_MCELIECE348864_SSE_MASK3_0(%rip),%xmm4 + +# qhasm: mask5 aligned= mem128[ PQCLEAN_MCELIECE348864_SSE_MASK3_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE348864_SSE_MASK3_1(%rip),>mask5=reg128#6 +# asm 2: movdqa PQCLEAN_MCELIECE348864_SSE_MASK3_1(%rip),>mask5=%xmm5 +movdqa PQCLEAN_MCELIECE348864_SSE_MASK3_1(%rip),%xmm5 + +# qhasm: x0 = mem128[ input_0 + 0 ] +# asm 1: movdqu 0(x0=reg128#7 +# asm 2: movdqu 0(x0=%xmm6 +movdqu 0(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 128 ] +# asm 1: movdqu 128(x1=reg128#8 +# asm 2: movdqu 128(x1=%xmm7 +movdqu 128(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 256 ] +# asm 1: movdqu 256(x2=reg128#9 +# asm 2: movdqu 256(x2=%xmm8 +movdqu 256(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 384 ] +# asm 1: movdqu 384(x3=reg128#10 +# asm 2: movdqu 384(x3=%xmm9 +movdqu 384(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 512 ] +# asm 1: movdqu 512(x4=reg128#11 +# asm 2: movdqu 
512(x4=%xmm10 +movdqu 512(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 640 ] +# asm 1: movdqu 640(x5=reg128#12 +# asm 2: movdqu 640(x5=%xmm11 +movdqu 640(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 768 ] +# asm 1: movdqu 768(x6=reg128#13 +# asm 2: movdqu 768(x6=%xmm12 +movdqu 768(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 896 ] +# asm 1: movdqu 896(x7=reg128#14 +# asm 2: movdqu 896(x7=%xmm13 +movdqu 896(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: 
v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: 
vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 0 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 16(x0=%xmm6 +movdqu 16(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 144 ] +# asm 1: movdqu 144(x1=reg128#8 +# asm 2: movdqu 144(x1=%xmm7 +movdqu 144(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 272 ] +# asm 1: movdqu 272(x2=reg128#9 +# asm 2: movdqu 272(x2=%xmm8 +movdqu 272(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 400 ] +# asm 1: movdqu 400(x3=reg128#10 +# asm 2: movdqu 400(x3=%xmm9 +movdqu 400(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 528 ] +# asm 1: movdqu 528(x4=reg128#11 +# asm 2: movdqu 528(x4=%xmm10 +movdqu 528(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 656 ] +# asm 1: movdqu 656(x5=reg128#12 +# asm 2: movdqu 656(x5=%xmm11 +movdqu 656(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 784 ] +# asm 1: movdqu 784(x6=reg128#13 +# asm 2: movdqu 784(x6=%xmm12 +movdqu 784(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 912 ] +# asm 1: movdqu 912(x7=reg128#14 +# asm 2: movdqu 912(x7=%xmm13 +movdqu 912(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 
2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# 
asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: 
x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 16 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 32(x0=%xmm6 +movdqu 32(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 160 ] +# asm 1: movdqu 160(x1=reg128#8 +# asm 2: movdqu 160(x1=%xmm7 +movdqu 160(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 288 ] +# asm 1: movdqu 288(x2=reg128#9 +# asm 2: movdqu 288(x2=%xmm8 +movdqu 288(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 416 ] +# asm 1: movdqu 416(x3=reg128#10 +# asm 2: movdqu 416(x3=%xmm9 +movdqu 416(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 544 ] +# asm 1: movdqu 544(x4=reg128#11 +# asm 2: movdqu 544(x4=%xmm10 +movdqu 544(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 672 ] +# asm 1: movdqu 672(x5=reg128#12 +# asm 2: movdqu 672(x5=%xmm11 +movdqu 672(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 800 ] +# asm 1: movdqu 800(x6=reg128#13 +# asm 2: movdqu 800(x6=%xmm12 +movdqu 800(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 928 ] +# asm 1: movdqu 928(x7=reg128#14 +# asm 2: movdqu 928(x7=%xmm13 +movdqu 928(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 
= x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: 
vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 32 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 48(x0=%xmm6 +movdqu 48(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 176 ] +# asm 1: movdqu 176(x1=reg128#8 +# asm 2: movdqu 176(x1=%xmm7 +movdqu 176(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 304 ] +# asm 1: movdqu 304(x2=reg128#9 +# asm 2: movdqu 304(x2=%xmm8 +movdqu 304(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 432 ] +# asm 1: movdqu 432(x3=reg128#10 +# asm 2: movdqu 432(x3=%xmm9 +movdqu 432(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 560 ] +# asm 1: movdqu 560(x4=reg128#11 +# asm 2: movdqu 560(x4=%xmm10 +movdqu 560(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 688 ] +# asm 1: movdqu 688(x5=reg128#12 +# asm 2: movdqu 688(x5=%xmm11 +movdqu 688(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 816 ] +# asm 1: movdqu 816(x6=reg128#13 +# asm 2: movdqu 816(x6=%xmm12 +movdqu 816(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 944 ] +# asm 1: movdqu 944(x7=reg128#14 +# asm 2: movdqu 944(x7=%xmm13 +movdqu 944(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: 
vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: 
vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + 
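+# The masked shift/or groups in this routine are the standard log-step
+# bit-matrix transpose pattern: with mask0/mask1 each x_i/x_{i+4} pair
+# exchanges 32-bit blocks, with mask2/mask3 each x_i/x_{i+2} pair
+# exchanges 16-bit blocks, and here with mask4/mask5 each x_{2i}/x_{2i+1}
+# pair exchanges bytes, so every group of eight 128-bit rows is
+# transposed down to 8-bit granularity before the next stage.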
+# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 48 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 64(x0=%xmm6 +movdqu 64(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 192 ] +# asm 1: movdqu 192(x1=reg128#8 +# asm 2: movdqu 192(x1=%xmm7 +movdqu 192(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 320 ] +# asm 1: movdqu 320(x2=reg128#9 +# asm 2: movdqu 320(x2=%xmm8 +movdqu 320(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 448 ] +# asm 1: movdqu 448(x3=reg128#10 +# asm 2: movdqu 448(x3=%xmm9 +movdqu 448(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 576 ] +# asm 1: movdqu 576(x4=reg128#11 +# asm 2: movdqu 576(x4=%xmm10 +movdqu 576(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 704 ] +# asm 1: movdqu 704(x5=reg128#12 +# asm 2: movdqu 704(x5=%xmm11 +movdqu 704(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 832 ] +# asm 1: movdqu 832(x6=reg128#13 +# asm 2: movdqu 832(x6=%xmm12 +movdqu 832(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 960 ] +# asm 1: movdqu 960(x7=reg128#14 +# asm 2: movdqu 960(x7=%xmm13 +movdqu 960(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 
& mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# 
asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 64 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 80(x0=%xmm6 +movdqu 80(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 208 ] +# asm 1: movdqu 208(x1=reg128#8 +# asm 2: movdqu 208(x1=%xmm7 +movdqu 208(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 336 ] +# asm 1: movdqu 336(x2=reg128#9 +# asm 2: movdqu 336(x2=%xmm8 +movdqu 336(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 464 ] +# asm 1: movdqu 464(x3=reg128#10 +# asm 2: movdqu 464(x3=%xmm9 +movdqu 464(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 592 ] +# asm 1: movdqu 592(x4=reg128#11 +# asm 2: movdqu 592(x4=%xmm10 +movdqu 592(%rdi),%xmm10 + +# qhasm: x5 = mem128[ 
input_0 + 720 ] +# asm 1: movdqu 720(x5=reg128#12 +# asm 2: movdqu 720(x5=%xmm11 +movdqu 720(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 848 ] +# asm 1: movdqu 848(x6=reg128#13 +# asm 2: movdqu 848(x6=%xmm12 +movdqu 848(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 976 ] +# asm 1: movdqu 976(x7=reg128#14 +# asm 2: movdqu 976(x7=%xmm13 +movdqu 976(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand 
v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor 
%xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 80 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 96(x0=%xmm6 +movdqu 96(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 224 ] +# asm 1: movdqu 224(x1=reg128#8 +# asm 2: movdqu 224(x1=%xmm7 +movdqu 224(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 352 ] +# asm 1: movdqu 352(x2=reg128#9 +# asm 2: movdqu 352(x2=%xmm8 +movdqu 352(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 480 ] +# asm 1: movdqu 480(x3=reg128#10 +# asm 2: movdqu 480(x3=%xmm9 +movdqu 480(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 608 ] +# asm 1: movdqu 608(x4=reg128#11 +# asm 2: movdqu 608(x4=%xmm10 +movdqu 608(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 736 ] +# asm 1: movdqu 736(x5=reg128#12 +# asm 2: movdqu 736(x5=%xmm11 +movdqu 736(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 864 ] +# asm 1: movdqu 864(x6=reg128#13 +# asm 2: movdqu 864(x6=%xmm12 +movdqu 864(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 992 ] +# asm 1: movdqu 992(x7=reg128#14 +# asm 2: movdqu 992(x7=%xmm13 +movdqu 992(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + 
+# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + 
+# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 
2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 96 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 112(x0=%xmm6 +movdqu 112(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 240 ] +# asm 1: movdqu 240(x1=reg128#8 +# asm 2: movdqu 240(x1=%xmm7 +movdqu 240(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 368 ] +# asm 1: movdqu 368(x2=reg128#9 +# asm 2: movdqu 368(x2=%xmm8 +movdqu 368(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 496 ] +# asm 1: movdqu 496(x3=reg128#10 +# asm 2: movdqu 496(x3=%xmm9 +movdqu 496(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 624 ] +# asm 1: movdqu 624(x4=reg128#11 +# asm 2: movdqu 624(x4=%xmm10 +movdqu 624(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 752 ] +# asm 1: movdqu 752(x5=reg128#12 +# asm 2: movdqu 752(x5=%xmm11 +movdqu 752(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 880 ] +# asm 1: movdqu 880(x6=reg128#13 +# asm 2: movdqu 880(x6=%xmm12 +movdqu 880(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 1008 ] +# asm 1: movdqu 1008(x7=reg128#14 +# asm 2: movdqu 1008(x7=%xmm13 +movdqu 1008(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#1 +# asm 2: vpand v00=%xmm0 +vpand %xmm0,%xmm9,%xmm0 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#13 +# asm 2: vpsllq $32,v10=%xmm12 +vpsllq $32,%xmm13,%xmm12 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#2 +# 
asm 2: vpand v11=%xmm1 +vpand %xmm1,%xmm13,%xmm1 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm12,%xmm0,%xmm0 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm9,%xmm1 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm2,%xmm14,%xmm9 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#13 +# asm 2: vpslld $16,v10=%xmm12 +vpslld $16,%xmm11,%xmm12 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#14 +# asm 2: vpsrld $16,v01=%xmm13 +vpsrld $16,%xmm14,%xmm13 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm12,%xmm9,%xmm9 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm13,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm10,%xmm12 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#14 +# asm 2: vpslld $16,v10=%xmm13 +vpslld $16,%xmm0,%xmm13 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm3,%xmm0,%xmm0 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#13 +# asm 2: vpor x1=%xmm12 +vpor %xmm13,%xmm12,%xmm12 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm10,%xmm0 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm2,%xmm6,%xmm10 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#14 +# asm 2: vpslld $16,v10=%xmm13 +vpslld $16,%xmm8,%xmm13 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#11 +# asm 2: vpor x4=%xmm10 +vpor %xmm13,%xmm10,%xmm10 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#3 +# asm 2: vpand v00=%xmm2 +vpand %xmm2,%xmm7,%xmm2 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#9 +# asm 2: vpslld $16,v10=%xmm8 +vpslld $16,%xmm1,%xmm8 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm3,%xmm1,%xmm1 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#3 +# asm 2: vpor x5=%xmm2 +vpor %xmm8,%xmm2,%xmm2 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm7,%xmm1 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#4 +# asm 2: vpand v00=%xmm3 +vpand %xmm4,%xmm9,%xmm3 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#8 +# asm 2: vpsllw $8,v10=%xmm7 +vpsllw $8,%xmm12,%xmm7 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#9 +# asm 2: vpsrlw $8,v01=%xmm8 +vpsrlw $8,%xmm9,%xmm8 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm5,%xmm12,%xmm9 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#4 +# asm 2: vpor x0=%xmm3 +vpor %xmm7,%xmm3,%xmm3 + +# qhasm: x1 = 
v01 | v11 +# asm 1: vpor x1=reg128#8 +# asm 2: vpor x1=%xmm7 +vpor %xmm9,%xmm8,%xmm7 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm4,%xmm11,%xmm8 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#10 +# asm 2: vpsllw $8,v10=%xmm9 +vpsllw $8,%xmm0,%xmm9 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm5,%xmm0,%xmm0 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#9 +# asm 2: vpor x2=%xmm8 +vpor %xmm9,%xmm8,%xmm8 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm11,%xmm0 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm10,%xmm9 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#12 +# asm 2: vpsllw $8,v10=%xmm11 +vpsllw $8,%xmm2,%xmm11 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#11 +# asm 2: vpsrlw $8,v01=%xmm10 +vpsrlw $8,%xmm10,%xmm10 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#3 +# asm 2: vpand v11=%xmm2 +vpand %xmm5,%xmm2,%xmm2 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#10 +# asm 2: vpor x4=%xmm9 +vpor %xmm11,%xmm9,%xmm9 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#3 +# asm 2: vpor x5=%xmm2 +vpor %xmm2,%xmm10,%xmm2 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#5 +# asm 2: vpand v00=%xmm4 +vpand %xmm4,%xmm6,%xmm4 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#11 +# asm 2: vpsllw $8,v10=%xmm10 +vpsllw $8,%xmm1,%xmm10 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm5,%xmm1,%xmm1 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#5 +# asm 2: vpor x6=%xmm4 +vpor %xmm10,%xmm4,%xmm4 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm6,%xmm1 + +# qhasm: mem128[ input_0 + 112 ] = x0 +# asm 1: movdqu mask0=reg128#1 +# asm 2: movdqa PQCLEAN_MCELIECE348864_SSE_MASK2_0(%rip),>mask0=%xmm0 +movdqa PQCLEAN_MCELIECE348864_SSE_MASK2_0(%rip),%xmm0 + +# qhasm: mask1 aligned= mem128[ PQCLEAN_MCELIECE348864_SSE_MASK2_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE348864_SSE_MASK2_1(%rip),>mask1=reg128#2 +# asm 2: movdqa PQCLEAN_MCELIECE348864_SSE_MASK2_1(%rip),>mask1=%xmm1 +movdqa PQCLEAN_MCELIECE348864_SSE_MASK2_1(%rip),%xmm1 + +# qhasm: mask2 aligned= mem128[ PQCLEAN_MCELIECE348864_SSE_MASK1_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE348864_SSE_MASK1_0(%rip),>mask2=reg128#3 +# asm 2: movdqa PQCLEAN_MCELIECE348864_SSE_MASK1_0(%rip),>mask2=%xmm2 +movdqa PQCLEAN_MCELIECE348864_SSE_MASK1_0(%rip),%xmm2 + +# qhasm: mask3 aligned= mem128[ PQCLEAN_MCELIECE348864_SSE_MASK1_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE348864_SSE_MASK1_1(%rip),>mask3=reg128#4 +# asm 2: movdqa PQCLEAN_MCELIECE348864_SSE_MASK1_1(%rip),>mask3=%xmm3 +movdqa PQCLEAN_MCELIECE348864_SSE_MASK1_1(%rip),%xmm3 + +# qhasm: mask4 aligned= mem128[ PQCLEAN_MCELIECE348864_SSE_MASK0_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE348864_SSE_MASK0_0(%rip),>mask4=reg128#5 +# asm 2: movdqa PQCLEAN_MCELIECE348864_SSE_MASK0_0(%rip),>mask4=%xmm4 +movdqa PQCLEAN_MCELIECE348864_SSE_MASK0_0(%rip),%xmm4 + +# qhasm: mask5 aligned= mem128[ PQCLEAN_MCELIECE348864_SSE_MASK0_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE348864_SSE_MASK0_1(%rip),>mask5=reg128#6 +# asm 2: movdqa 
PQCLEAN_MCELIECE348864_SSE_MASK0_1(%rip),>mask5=%xmm5 +movdqa PQCLEAN_MCELIECE348864_SSE_MASK0_1(%rip),%xmm5 + +# qhasm: x0 = mem128[ input_0 + 0 ] +# asm 1: movdqu 0(x0=reg128#7 +# asm 2: movdqu 0(x0=%xmm6 +movdqu 0(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 16 ] +# asm 1: movdqu 16(x1=reg128#8 +# asm 2: movdqu 16(x1=%xmm7 +movdqu 16(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 32 ] +# asm 1: movdqu 32(x2=reg128#9 +# asm 2: movdqu 32(x2=%xmm8 +movdqu 32(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 48 ] +# asm 1: movdqu 48(x3=reg128#10 +# asm 2: movdqu 48(x3=%xmm9 +movdqu 48(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 64 ] +# asm 1: movdqu 64(x4=reg128#11 +# asm 2: movdqu 64(x4=%xmm10 +movdqu 64(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 80 ] +# asm 1: movdqu 80(x5=reg128#12 +# asm 2: movdqu 80(x5=%xmm11 +movdqu 80(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 96 ] +# asm 1: movdqu 96(x6=reg128#13 +# asm 2: movdqu 96(x6=%xmm12 +movdqu 96(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 112 ] +# asm 1: movdqu 112(x7=reg128#14 +# asm 2: movdqu 112(x7=%xmm13 +movdqu 112(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand 
%xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor 
%xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 0 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 128(x0=%xmm6 +movdqu 128(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 144 ] +# asm 1: movdqu 144(x1=reg128#8 +# asm 2: movdqu 144(x1=%xmm7 +movdqu 144(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 160 ] +# asm 1: movdqu 160(x2=reg128#9 +# asm 2: movdqu 160(x2=%xmm8 +movdqu 160(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 176 ] +# asm 1: movdqu 176(x3=reg128#10 +# asm 2: movdqu 176(x3=%xmm9 +movdqu 176(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 192 ] +# asm 1: movdqu 192(x4=reg128#11 +# asm 2: movdqu 192(x4=%xmm10 +movdqu 192(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 208 ] +# asm 1: movdqu 208(x5=reg128#12 +# asm 2: movdqu 208(x5=%xmm11 +movdqu 208(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 224 ] +# asm 1: movdqu 224(x6=reg128#13 +# asm 2: movdqu 224(x6=%xmm12 +movdqu 224(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 240 ] +# asm 1: movdqu 240(x7=reg128#14 +# asm 2: movdqu 240(x7=%xmm13 +movdqu 240(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq 
$4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# 
asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: 
psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 128 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 256(x0=%xmm6 +movdqu 256(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 272 ] +# asm 1: movdqu 272(x1=reg128#8 +# asm 2: movdqu 272(x1=%xmm7 +movdqu 272(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 288 ] +# asm 1: movdqu 288(x2=reg128#9 +# asm 2: movdqu 288(x2=%xmm8 +movdqu 288(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 304 ] +# asm 1: movdqu 304(x3=reg128#10 +# asm 2: movdqu 304(x3=%xmm9 +movdqu 304(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 320 ] +# asm 1: movdqu 320(x4=reg128#11 +# asm 2: movdqu 320(x4=%xmm10 +movdqu 320(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 336 ] +# asm 1: movdqu 336(x5=reg128#12 +# asm 2: movdqu 336(x5=%xmm11 +movdqu 336(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 352 ] +# asm 1: movdqu 352(x6=reg128#13 +# asm 2: movdqu 352(x6=%xmm12 +movdqu 352(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 368 ] +# asm 1: movdqu 368(x7=reg128#14 +# asm 2: movdqu 368(x7=%xmm13 +movdqu 368(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# 
qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + 
+# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 256 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 384(x0=%xmm6 +movdqu 384(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 400 ] +# asm 1: movdqu 400(x1=reg128#8 +# asm 2: movdqu 400(x1=%xmm7 +movdqu 400(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 416 ] +# asm 1: movdqu 416(x2=reg128#9 +# asm 2: movdqu 416(x2=%xmm8 +movdqu 416(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 432 ] +# asm 1: movdqu 432(x3=reg128#10 +# asm 2: movdqu 432(x3=%xmm9 +movdqu 432(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 448 ] +# asm 1: movdqu 448(x4=reg128#11 +# asm 2: movdqu 448(x4=%xmm10 +movdqu 448(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 464 ] +# asm 1: movdqu 464(x5=reg128#12 +# asm 2: movdqu 464(x5=%xmm11 +movdqu 464(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 480 ] +# asm 1: movdqu 480(x6=reg128#13 +# asm 2: movdqu 480(x6=%xmm12 +movdqu 480(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 496 ] +# asm 1: movdqu 496(x7=reg128#14 +# asm 2: movdqu 496(x7=%xmm13 +movdqu 496(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand 
v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# 
asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# 
asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 384 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 512(x0=%xmm6 +movdqu 512(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 528 ] +# asm 1: movdqu 528(x1=reg128#8 +# asm 2: movdqu 528(x1=%xmm7 +movdqu 528(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 544 ] +# asm 1: movdqu 544(x2=reg128#9 +# asm 2: movdqu 544(x2=%xmm8 +movdqu 544(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 560 ] +# asm 1: movdqu 560(x3=reg128#10 +# asm 2: movdqu 560(x3=%xmm9 +movdqu 560(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 576 ] +# asm 1: movdqu 576(x4=reg128#11 +# asm 2: movdqu 576(x4=%xmm10 +movdqu 576(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 592 ] +# asm 1: movdqu 592(x5=reg128#12 +# asm 2: movdqu 592(x5=%xmm11 +movdqu 592(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 608 ] +# asm 1: movdqu 608(x6=reg128#13 +# asm 2: movdqu 608(x6=%xmm12 +movdqu 608(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 624 ] +# asm 1: movdqu 624(x7=reg128#14 +# asm 2: movdqu 624(x7=%xmm13 +movdqu 624(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# 
asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: 
vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 512 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 640(x0=%xmm6 +movdqu 640(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 656 ] +# asm 1: movdqu 656(x1=reg128#8 +# asm 2: movdqu 656(x1=%xmm7 +movdqu 656(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 672 ] +# asm 1: movdqu 672(x2=reg128#9 +# asm 2: movdqu 672(x2=%xmm8 +movdqu 672(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 688 ] +# asm 1: movdqu 688(x3=reg128#10 +# asm 2: movdqu 688(x3=%xmm9 +movdqu 688(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 704 ] +# asm 1: movdqu 704(x4=reg128#11 +# asm 2: movdqu 704(x4=%xmm10 +movdqu 704(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 720 ] +# asm 1: movdqu 720(x5=reg128#12 +# asm 2: movdqu 720(x5=%xmm11 +movdqu 720(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 736 ] +# asm 1: movdqu 736(x6=reg128#13 +# asm 2: movdqu 736(x6=%xmm12 +movdqu 736(%rdi),%xmm12 + +# qhasm: x7 = 
mem128[ input_0 + 752 ] +# asm 1: movdqu 752(x7=reg128#14 +# asm 2: movdqu 752(x7=%xmm13 +movdqu 752(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand 
%xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 
+vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 640 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 768(x0=%xmm6 +movdqu 768(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 784 ] +# asm 1: movdqu 784(x1=reg128#8 +# asm 2: movdqu 784(x1=%xmm7 +movdqu 784(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 800 ] +# asm 1: movdqu 800(x2=reg128#9 +# asm 2: movdqu 800(x2=%xmm8 +movdqu 800(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 816 ] +# asm 1: movdqu 816(x3=reg128#10 +# asm 2: movdqu 816(x3=%xmm9 +movdqu 816(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 832 ] +# asm 1: movdqu 832(x4=reg128#11 +# asm 2: movdqu 832(x4=%xmm10 +movdqu 832(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 848 ] +# asm 1: movdqu 848(x5=reg128#12 +# asm 2: movdqu 848(x5=%xmm11 +movdqu 848(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 864 ] +# asm 1: movdqu 864(x6=reg128#13 +# asm 2: movdqu 864(x6=%xmm12 +movdqu 864(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 880 ] +# asm 1: movdqu 880(x7=reg128#14 +# asm 2: movdqu 880(x7=%xmm13 +movdqu 880(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# 
asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand 
v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 768 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 896(x0=%xmm6 +movdqu 896(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 912 ] +# asm 1: movdqu 912(x1=reg128#8 +# asm 2: movdqu 912(x1=%xmm7 +movdqu 912(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 928 ] +# asm 1: movdqu 928(x2=reg128#9 +# asm 2: movdqu 928(x2=%xmm8 +movdqu 928(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 944 ] +# asm 1: movdqu 944(x3=reg128#10 +# asm 2: movdqu 944(x3=%xmm9 +movdqu 944(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 960 ] +# asm 1: movdqu 960(x4=reg128#11 +# asm 2: movdqu 
960(x4=%xmm10 +movdqu 960(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 976 ] +# asm 1: movdqu 976(x5=reg128#12 +# asm 2: movdqu 976(x5=%xmm11 +movdqu 976(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 992 ] +# asm 1: movdqu 992(x6=reg128#13 +# asm 2: movdqu 992(x6=%xmm12 +movdqu 992(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 1008 ] +# asm 1: movdqu 1008(x7=reg128#14 +# asm 2: movdqu 1008(x7=%xmm13 +movdqu 1008(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#1 +# asm 2: vpand v10=%xmm0 +vpand %xmm0,%xmm13,%xmm0 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm1,%xmm13,%xmm1 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm12,%xmm0 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm9,%xmm1 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm2,%xmm14,%xmm9 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#13 +# asm 2: vpand v10=%xmm12 +vpand %xmm2,%xmm11,%xmm12 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm3,%xmm14,%xmm13 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x 
v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm12,%xmm9,%xmm9 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm13,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm10,%xmm12 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#14 +# asm 2: vpand v10=%xmm13 +vpand %xmm2,%xmm0,%xmm13 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm3,%xmm0,%xmm0 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#13 +# asm 2: vpor x1=%xmm12 +vpor %xmm13,%xmm12,%xmm12 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm10,%xmm0 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm2,%xmm6,%xmm10 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#14 +# asm 2: vpand v10=%xmm13 +vpand %xmm2,%xmm8,%xmm13 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#11 +# asm 2: vpor x4=%xmm10 +vpor %xmm13,%xmm10,%xmm10 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#3 +# asm 2: vpand v10=%xmm2 +vpand %xmm2,%xmm1,%xmm2 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm3,%xmm1,%xmm1 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#3 +# asm 2: vpor x5=%xmm2 +vpor %xmm2,%xmm8,%xmm2 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm7,%xmm1 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#4 +# asm 2: vpand v00=%xmm3 +vpand %xmm4,%xmm9,%xmm3 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#8 +# asm 2: vpand v10=%xmm7 +vpand %xmm4,%xmm12,%xmm7 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm5,%xmm9,%xmm8 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm5,%xmm12,%xmm9 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#4 +# asm 2: vpor x0=%xmm3 +vpor %xmm7,%xmm3,%xmm3 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#8 +# asm 2: vpor x1=%xmm7 +vpor %xmm9,%xmm8,%xmm7 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm4,%xmm11,%xmm8 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#10 +# asm 2: vpand v10=%xmm9 +vpand %xmm4,%xmm0,%xmm9 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm5,%xmm0,%xmm0 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#9 +# asm 2: vpor x2=%xmm8 +vpor %xmm9,%xmm8,%xmm8 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm11,%xmm0 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 
2: vpand v00=%xmm9 +vpand %xmm4,%xmm10,%xmm9 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#12 +# asm 2: vpand v10=%xmm11 +vpand %xmm4,%xmm2,%xmm11 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#3 +# asm 2: vpand v11=%xmm2 +vpand %xmm5,%xmm2,%xmm2 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#10 +# asm 2: vpor x4=%xmm9 +vpor %xmm11,%xmm9,%xmm9 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#3 +# asm 2: vpor x5=%xmm2 +vpor %xmm2,%xmm10,%xmm2 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm4,%xmm6,%xmm10 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#5 +# asm 2: vpand v10=%xmm4 +vpand %xmm4,%xmm1,%xmm4 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm5,%xmm1,%xmm1 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#5 +# asm 2: vpor x6=%xmm4 +vpor %xmm4,%xmm10,%xmm4 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm6,%xmm1 + +# qhasm: mem128[ input_0 + 896 ] = x0 +# asm 1: movdqu mask0=reg128#1 +# asm 2: movdqa PQCLEAN_MCELIECE348864_SSE_MASK5_0(%rip),>mask0=%xmm0 +movdqa PQCLEAN_MCELIECE348864_SSE_MASK5_0(%rip),%xmm0 + +# qhasm: mask1 aligned= mem128[ PQCLEAN_MCELIECE348864_SSE_MASK5_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE348864_SSE_MASK5_1(%rip),>mask1=reg128#2 +# asm 2: movdqa PQCLEAN_MCELIECE348864_SSE_MASK5_1(%rip),>mask1=%xmm1 +movdqa PQCLEAN_MCELIECE348864_SSE_MASK5_1(%rip),%xmm1 + +# qhasm: mask2 aligned= mem128[ PQCLEAN_MCELIECE348864_SSE_MASK4_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE348864_SSE_MASK4_0(%rip),>mask2=reg128#3 +# asm 2: movdqa PQCLEAN_MCELIECE348864_SSE_MASK4_0(%rip),>mask2=%xmm2 +movdqa PQCLEAN_MCELIECE348864_SSE_MASK4_0(%rip),%xmm2 + +# qhasm: mask3 aligned= mem128[ PQCLEAN_MCELIECE348864_SSE_MASK4_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE348864_SSE_MASK4_1(%rip),>mask3=reg128#4 +# asm 2: movdqa PQCLEAN_MCELIECE348864_SSE_MASK4_1(%rip),>mask3=%xmm3 +movdqa PQCLEAN_MCELIECE348864_SSE_MASK4_1(%rip),%xmm3 + +# qhasm: mask4 aligned= mem128[ PQCLEAN_MCELIECE348864_SSE_MASK3_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE348864_SSE_MASK3_0(%rip),>mask4=reg128#5 +# asm 2: movdqa PQCLEAN_MCELIECE348864_SSE_MASK3_0(%rip),>mask4=%xmm4 +movdqa PQCLEAN_MCELIECE348864_SSE_MASK3_0(%rip),%xmm4 + +# qhasm: mask5 aligned= mem128[ PQCLEAN_MCELIECE348864_SSE_MASK3_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE348864_SSE_MASK3_1(%rip),>mask5=reg128#6 +# asm 2: movdqa PQCLEAN_MCELIECE348864_SSE_MASK3_1(%rip),>mask5=%xmm5 +movdqa PQCLEAN_MCELIECE348864_SSE_MASK3_1(%rip),%xmm5 + +# qhasm: r0 = mem64[ input_0 + 0 ] x2 +# asm 1: movddup 0(r0=reg128#7 +# asm 2: movddup 0(r0=%xmm6 +movddup 0(%rdi),%xmm6 + +# qhasm: r1 = mem64[ input_0 + 64 ] x2 +# asm 1: movddup 64(r1=reg128#8 +# asm 2: movddup 64(r1=%xmm7 +movddup 64(%rdi),%xmm7 + +# qhasm: r2 = mem64[ input_0 + 128 ] x2 +# asm 1: movddup 128(r2=reg128#9 +# asm 2: movddup 128(r2=%xmm8 +movddup 128(%rdi),%xmm8 + +# qhasm: r3 = mem64[ input_0 + 192 ] x2 +# asm 1: movddup 192(r3=reg128#10 +# asm 2: movddup 192(r3=%xmm9 +movddup 192(%rdi),%xmm9 + +# qhasm: r4 = mem64[ input_0 + 256 ] x2 +# asm 1: movddup 256(r4=reg128#11 +# asm 2: movddup 256(r4=%xmm10 +movddup 256(%rdi),%xmm10 + +# qhasm: r5 = mem64[ input_0 + 320 ] x2 +# asm 1: movddup 320(r5=reg128#12 +# asm 2: 
[qhasm-generated SSE assembly copied from upstream for the Classic McEliece addition (a bit-matrix transpose routine), continued. The hunk repeats one fixed pattern for byte offsets 0, 8, 16, ..., 56 from input_0 (%rdi): eight 64-bit rows are loaded with movddup at offsets 64 bytes apart, combined pairwise through masked shifts (vpsllq/vpsrlq by 32 against mask0/mask1, vpslld/vpsrld by 16 against mask2/mask3, vpsllw/vpsrlw by 8 against mask4/mask5, with the masks preloaded in %xmm0-%xmm5), and the recombined 64-bit words are written back to the same offsets with pextrq and movq. Together these passes perform the 32-, 16- and 8-bit levels of a 64x64 bit-matrix transpose.]
r3=%xmm0 +vpor %xmm12,%xmm0,%xmm0 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#2 +# asm 2: vpor r7=%xmm1 +vpor %xmm1,%xmm9,%xmm1 + +# qhasm: v00 = r0 & mask2 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm2,%xmm14,%xmm9 + +# qhasm: 4x v10 = r2 << 16 +# asm 1: vpslld $16,v10=reg128#13 +# asm 2: vpslld $16,v10=%xmm12 +vpslld $16,%xmm11,%xmm12 + +# qhasm: 4x v01 = r0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#14 +# asm 2: vpsrld $16,v01=%xmm13 +vpsrld $16,%xmm14,%xmm13 + +# qhasm: v11 = r2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#10 +# asm 2: vpor r0=%xmm9 +vpor %xmm12,%xmm9,%xmm9 + +# qhasm: r2 = v01 | v11 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm11,%xmm13,%xmm11 + +# qhasm: v00 = r1 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm10,%xmm12 + +# qhasm: 4x v10 = r3 << 16 +# asm 1: vpslld $16,v10=reg128#14 +# asm 2: vpslld $16,v10=%xmm13 +vpslld $16,%xmm0,%xmm13 + +# qhasm: 4x v01 = r1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = r3 & mask3 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm3,%xmm0,%xmm0 + +# qhasm: r1 = v00 | v10 +# asm 1: vpor r1=reg128#13 +# asm 2: vpor r1=%xmm12 +vpor %xmm13,%xmm12,%xmm12 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#1 +# asm 2: vpor r3=%xmm0 +vpor %xmm0,%xmm10,%xmm0 + +# qhasm: v00 = r4 & mask2 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm2,%xmm6,%xmm10 + +# qhasm: 4x v10 = r6 << 16 +# asm 1: vpslld $16,v10=reg128#14 +# asm 2: vpslld $16,v10=%xmm13 +vpslld $16,%xmm8,%xmm13 + +# qhasm: 4x v01 = r4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = r6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: r4 = v00 | v10 +# asm 1: vpor r4=reg128#11 +# asm 2: vpor r4=%xmm10 +vpor %xmm13,%xmm10,%xmm10 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#7 +# asm 2: vpor r6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = r5 & mask2 +# asm 1: vpand v00=reg128#3 +# asm 2: vpand v00=%xmm2 +vpand %xmm2,%xmm7,%xmm2 + +# qhasm: 4x v10 = r7 << 16 +# asm 1: vpslld $16,v10=reg128#9 +# asm 2: vpslld $16,v10=%xmm8 +vpslld $16,%xmm1,%xmm8 + +# qhasm: 4x v01 = r5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = r7 & mask3 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm3,%xmm1,%xmm1 + +# qhasm: r5 = v00 | v10 +# asm 1: vpor r5=reg128#3 +# asm 2: vpor r5=%xmm2 +vpor %xmm8,%xmm2,%xmm2 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#2 +# asm 2: vpor r7=%xmm1 +vpor %xmm1,%xmm7,%xmm1 + +# qhasm: v00 = r0 & mask4 +# asm 1: vpand v00=reg128#4 +# asm 2: vpand v00=%xmm3 +vpand %xmm4,%xmm9,%xmm3 + +# qhasm: 8x v10 = r1 << 8 +# asm 1: vpsllw $8,v10=reg128#8 +# asm 2: vpsllw $8,v10=%xmm7 +vpsllw $8,%xmm12,%xmm7 + +# qhasm: 8x v01 = r0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#9 +# asm 2: vpsrlw $8,v01=%xmm8 +vpsrlw $8,%xmm9,%xmm8 + +# qhasm: v11 = r1 & mask5 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm5,%xmm12,%xmm9 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#4 +# asm 2: vpor r0=%xmm3 +vpor %xmm7,%xmm3,%xmm3 + +# qhasm: r1 = v01 | v11 +# asm 1: vpor r1=reg128#8 +# asm 2: vpor r1=%xmm7 +vpor %xmm9,%xmm8,%xmm7 + +# qhasm: v00 = r2 & mask4 +# 
asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm4,%xmm11,%xmm8 + +# qhasm: 8x v10 = r3 << 8 +# asm 1: vpsllw $8,v10=reg128#10 +# asm 2: vpsllw $8,v10=%xmm9 +vpsllw $8,%xmm0,%xmm9 + +# qhasm: 8x v01 = r2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = r3 & mask5 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm5,%xmm0,%xmm0 + +# qhasm: r2 = v00 | v10 +# asm 1: vpor r2=reg128#9 +# asm 2: vpor r2=%xmm8 +vpor %xmm9,%xmm8,%xmm8 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#1 +# asm 2: vpor r3=%xmm0 +vpor %xmm0,%xmm11,%xmm0 + +# qhasm: v00 = r4 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm10,%xmm9 + +# qhasm: 8x v10 = r5 << 8 +# asm 1: vpsllw $8,v10=reg128#12 +# asm 2: vpsllw $8,v10=%xmm11 +vpsllw $8,%xmm2,%xmm11 + +# qhasm: 8x v01 = r4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#11 +# asm 2: vpsrlw $8,v01=%xmm10 +vpsrlw $8,%xmm10,%xmm10 + +# qhasm: v11 = r5 & mask5 +# asm 1: vpand v11=reg128#3 +# asm 2: vpand v11=%xmm2 +vpand %xmm5,%xmm2,%xmm2 + +# qhasm: r4 = v00 | v10 +# asm 1: vpor r4=reg128#10 +# asm 2: vpor r4=%xmm9 +vpor %xmm11,%xmm9,%xmm9 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#3 +# asm 2: vpor r5=%xmm2 +vpor %xmm2,%xmm10,%xmm2 + +# qhasm: v00 = r6 & mask4 +# asm 1: vpand v00=reg128#5 +# asm 2: vpand v00=%xmm4 +vpand %xmm4,%xmm6,%xmm4 + +# qhasm: 8x v10 = r7 << 8 +# asm 1: vpsllw $8,v10=reg128#11 +# asm 2: vpsllw $8,v10=%xmm10 +vpsllw $8,%xmm1,%xmm10 + +# qhasm: 8x v01 = r6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = r7 & mask5 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm5,%xmm1,%xmm1 + +# qhasm: r6 = v00 | v10 +# asm 1: vpor r6=reg128#5 +# asm 2: vpor r6=%xmm4 +vpor %xmm10,%xmm4,%xmm4 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#2 +# asm 2: vpor r7=%xmm1 +vpor %xmm1,%xmm6,%xmm1 + +# qhasm: buf = r0[0] +# asm 1: pextrq $0x0,buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm3,%rsi + +# qhasm: mem64[ input_0 + 56 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm7,%rsi + +# qhasm: mem64[ input_0 + 120 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm8,%rsi + +# qhasm: mem64[ input_0 + 184 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm0,%rsi + +# qhasm: mem64[ input_0 + 248 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm9,%rsi + +# qhasm: mem64[ input_0 + 312 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm2,%rsi + +# qhasm: mem64[ input_0 + 376 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm4,%rsi + +# qhasm: mem64[ input_0 + 440 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm1,%rsi + +# qhasm: mem64[ input_0 + 504 ] = buf +# asm 1: movq mask0=reg128#1 +# asm 2: movdqa PQCLEAN_MCELIECE348864_SSE_MASK2_0(%rip),>mask0=%xmm0 +movdqa PQCLEAN_MCELIECE348864_SSE_MASK2_0(%rip),%xmm0 + +# qhasm: mask1 aligned= mem128[ PQCLEAN_MCELIECE348864_SSE_MASK2_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE348864_SSE_MASK2_1(%rip),>mask1=reg128#2 +# asm 2: movdqa PQCLEAN_MCELIECE348864_SSE_MASK2_1(%rip),>mask1=%xmm1 +movdqa PQCLEAN_MCELIECE348864_SSE_MASK2_1(%rip),%xmm1 + +# qhasm: mask2 aligned= mem128[ PQCLEAN_MCELIECE348864_SSE_MASK1_0 ] +# asm 1: movdqa 
PQCLEAN_MCELIECE348864_SSE_MASK1_0(%rip),>mask2=reg128#3 +# asm 2: movdqa PQCLEAN_MCELIECE348864_SSE_MASK1_0(%rip),>mask2=%xmm2 +movdqa PQCLEAN_MCELIECE348864_SSE_MASK1_0(%rip),%xmm2 + +# qhasm: mask3 aligned= mem128[ PQCLEAN_MCELIECE348864_SSE_MASK1_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE348864_SSE_MASK1_1(%rip),>mask3=reg128#4 +# asm 2: movdqa PQCLEAN_MCELIECE348864_SSE_MASK1_1(%rip),>mask3=%xmm3 +movdqa PQCLEAN_MCELIECE348864_SSE_MASK1_1(%rip),%xmm3 + +# qhasm: mask4 aligned= mem128[ PQCLEAN_MCELIECE348864_SSE_MASK0_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE348864_SSE_MASK0_0(%rip),>mask4=reg128#5 +# asm 2: movdqa PQCLEAN_MCELIECE348864_SSE_MASK0_0(%rip),>mask4=%xmm4 +movdqa PQCLEAN_MCELIECE348864_SSE_MASK0_0(%rip),%xmm4 + +# qhasm: mask5 aligned= mem128[ PQCLEAN_MCELIECE348864_SSE_MASK0_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE348864_SSE_MASK0_1(%rip),>mask5=reg128#6 +# asm 2: movdqa PQCLEAN_MCELIECE348864_SSE_MASK0_1(%rip),>mask5=%xmm5 +movdqa PQCLEAN_MCELIECE348864_SSE_MASK0_1(%rip),%xmm5 + +# qhasm: r0 = mem64[ input_0 + 0 ] x2 +# asm 1: movddup 0(r0=reg128#7 +# asm 2: movddup 0(r0=%xmm6 +movddup 0(%rdi),%xmm6 + +# qhasm: r1 = mem64[ input_0 + 8 ] x2 +# asm 1: movddup 8(r1=reg128#8 +# asm 2: movddup 8(r1=%xmm7 +movddup 8(%rdi),%xmm7 + +# qhasm: r2 = mem64[ input_0 + 16 ] x2 +# asm 1: movddup 16(r2=reg128#9 +# asm 2: movddup 16(r2=%xmm8 +movddup 16(%rdi),%xmm8 + +# qhasm: r3 = mem64[ input_0 + 24 ] x2 +# asm 1: movddup 24(r3=reg128#10 +# asm 2: movddup 24(r3=%xmm9 +movddup 24(%rdi),%xmm9 + +# qhasm: r4 = mem64[ input_0 + 32 ] x2 +# asm 1: movddup 32(r4=reg128#11 +# asm 2: movddup 32(r4=%xmm10 +movddup 32(%rdi),%xmm10 + +# qhasm: r5 = mem64[ input_0 + 40 ] x2 +# asm 1: movddup 40(r5=reg128#12 +# asm 2: movddup 40(r5=%xmm11 +movddup 40(%rdi),%xmm11 + +# qhasm: r6 = mem64[ input_0 + 48 ] x2 +# asm 1: movddup 48(r6=reg128#13 +# asm 2: movddup 48(r6=%xmm12 +movddup 48(%rdi),%xmm12 + +# qhasm: r7 = mem64[ input_0 + 56 ] x2 +# asm 1: movddup 56(r7=reg128#14 +# asm 2: movddup 56(r7=%xmm13 +movddup 56(%rdi),%xmm13 + +# qhasm: v00 = r0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = r4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = r4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r0=reg128#15 +# asm 2: vpor r0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r4 = v01 | v11 +# asm 1: vpor r4=reg128#7 +# asm 2: vpor r4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = r1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = r5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = r5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r1=reg128#11 +# asm 2: vpor r1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#8 +# asm 2: vpor r5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = r2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = r6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand 
v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = r6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#9 +# asm 2: vpor r6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = r3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = r7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = r7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r3=reg128#13 +# asm 2: vpor r3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#10 +# asm 2: vpor r7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = r0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = r2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = r2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r0=reg128#14 +# asm 2: vpor r0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: r2 = v01 | v11 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = r1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = r3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = r3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r1=reg128#15 +# asm 2: vpor r1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = r4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = r6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = r6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r4=reg128#13 +# asm 2: vpor r4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#7 +# asm 2: vpor r6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = r5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = r7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = r7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: 
vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#8 +# asm 2: vpor r7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = r0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = r1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = r1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r0=reg128#10 +# asm 2: vpor r0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: r1 = v01 | v11 +# asm 1: vpor r1=reg128#14 +# asm 2: vpor r1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = r2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = r3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = r3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r2=reg128#15 +# asm 2: vpor r2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = r4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = r5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = r5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r4=reg128#12 +# asm 2: vpor r4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = r6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = r7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = r7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r6=reg128#13 +# asm 2: vpor r6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#7 +# asm 2: vpor r7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: t0 = r0[0]r1[0] +# asm 1: vpunpcklqdq t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm13,%xmm9,%xmm7 + +# qhasm: mem128[ input_0 + 0 ] = t0 +# asm 1: movdqu t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm10,%xmm14,%xmm7 + +# qhasm: mem128[ input_0 + 16 ] = t0 +# asm 1: movdqu t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm8,%xmm11,%xmm7 + +# qhasm: mem128[ input_0 + 32 ] = t0 +# asm 1: movdqu t0=reg128#7 +# asm 2: vpunpcklqdq t0=%xmm6 +vpunpcklqdq %xmm6,%xmm12,%xmm6 + +# qhasm: mem128[ input_0 + 48 ] = t0 +# asm 1: movdqu r0=reg128#7 +# asm 2: movddup 64(r0=%xmm6 
+movddup 64(%rdi),%xmm6 + +# qhasm: r1 = mem64[ input_0 + 72 ] x2 +# asm 1: movddup 72(r1=reg128#8 +# asm 2: movddup 72(r1=%xmm7 +movddup 72(%rdi),%xmm7 + +# qhasm: r2 = mem64[ input_0 + 80 ] x2 +# asm 1: movddup 80(r2=reg128#9 +# asm 2: movddup 80(r2=%xmm8 +movddup 80(%rdi),%xmm8 + +# qhasm: r3 = mem64[ input_0 + 88 ] x2 +# asm 1: movddup 88(r3=reg128#10 +# asm 2: movddup 88(r3=%xmm9 +movddup 88(%rdi),%xmm9 + +# qhasm: r4 = mem64[ input_0 + 96 ] x2 +# asm 1: movddup 96(r4=reg128#11 +# asm 2: movddup 96(r4=%xmm10 +movddup 96(%rdi),%xmm10 + +# qhasm: r5 = mem64[ input_0 + 104 ] x2 +# asm 1: movddup 104(r5=reg128#12 +# asm 2: movddup 104(r5=%xmm11 +movddup 104(%rdi),%xmm11 + +# qhasm: r6 = mem64[ input_0 + 112 ] x2 +# asm 1: movddup 112(r6=reg128#13 +# asm 2: movddup 112(r6=%xmm12 +movddup 112(%rdi),%xmm12 + +# qhasm: r7 = mem64[ input_0 + 120 ] x2 +# asm 1: movddup 120(r7=reg128#14 +# asm 2: movddup 120(r7=%xmm13 +movddup 120(%rdi),%xmm13 + +# qhasm: v00 = r0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = r4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = r4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r0=reg128#15 +# asm 2: vpor r0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r4 = v01 | v11 +# asm 1: vpor r4=reg128#7 +# asm 2: vpor r4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = r1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = r5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = r5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r1=reg128#11 +# asm 2: vpor r1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#8 +# asm 2: vpor r5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = r2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = r6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = r6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#9 +# asm 2: vpor r6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = r3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = r7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = r7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r3=reg128#13 +# asm 2: vpor r3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: 
vpor r7=reg128#10 +# asm 2: vpor r7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = r0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = r2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = r2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r0=reg128#14 +# asm 2: vpor r0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: r2 = v01 | v11 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = r1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = r3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = r3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r1=reg128#15 +# asm 2: vpor r1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = r4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = r6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = r6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r4=reg128#13 +# asm 2: vpor r4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#7 +# asm 2: vpor r6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = r5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = r7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = r7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#8 +# asm 2: vpor r7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = r0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = r1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = r1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r0=reg128#10 +# asm 2: vpor r0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: r1 = v01 | v11 +# asm 1: vpor r1=reg128#14 +# asm 2: vpor r1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = r2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = r3 & mask4 +# asm 1: 
vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = r3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r2=reg128#15 +# asm 2: vpor r2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = r4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = r5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = r5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r4=reg128#12 +# asm 2: vpor r4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = r6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = r7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = r7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r6=reg128#13 +# asm 2: vpor r6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#7 +# asm 2: vpor r7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: t0 = r0[0]r1[0] +# asm 1: vpunpcklqdq t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm13,%xmm9,%xmm7 + +# qhasm: mem128[ input_0 + 64 ] = t0 +# asm 1: movdqu t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm10,%xmm14,%xmm7 + +# qhasm: mem128[ input_0 + 80 ] = t0 +# asm 1: movdqu t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm8,%xmm11,%xmm7 + +# qhasm: mem128[ input_0 + 96 ] = t0 +# asm 1: movdqu t0=reg128#7 +# asm 2: vpunpcklqdq t0=%xmm6 +vpunpcklqdq %xmm6,%xmm12,%xmm6 + +# qhasm: mem128[ input_0 + 112 ] = t0 +# asm 1: movdqu r0=reg128#7 +# asm 2: movddup 128(r0=%xmm6 +movddup 128(%rdi),%xmm6 + +# qhasm: r1 = mem64[ input_0 + 136 ] x2 +# asm 1: movddup 136(r1=reg128#8 +# asm 2: movddup 136(r1=%xmm7 +movddup 136(%rdi),%xmm7 + +# qhasm: r2 = mem64[ input_0 + 144 ] x2 +# asm 1: movddup 144(r2=reg128#9 +# asm 2: movddup 144(r2=%xmm8 +movddup 144(%rdi),%xmm8 + +# qhasm: r3 = mem64[ input_0 + 152 ] x2 +# asm 1: movddup 152(r3=reg128#10 +# asm 2: movddup 152(r3=%xmm9 +movddup 152(%rdi),%xmm9 + +# qhasm: r4 = mem64[ input_0 + 160 ] x2 +# asm 1: movddup 160(r4=reg128#11 +# asm 2: movddup 160(r4=%xmm10 +movddup 160(%rdi),%xmm10 + +# qhasm: r5 = mem64[ input_0 + 168 ] x2 +# asm 1: movddup 168(r5=reg128#12 +# asm 2: movddup 168(r5=%xmm11 +movddup 168(%rdi),%xmm11 + +# qhasm: r6 = mem64[ input_0 + 176 ] x2 +# asm 1: movddup 176(r6=reg128#13 +# asm 2: movddup 176(r6=%xmm12 +movddup 176(%rdi),%xmm12 + +# qhasm: r7 = mem64[ input_0 + 184 ] x2 +# asm 1: movddup 184(r7=reg128#14 +# asm 2: movddup 184(r7=%xmm13 +movddup 184(%rdi),%xmm13 + +# qhasm: v00 = r0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + 
+# qhasm: v10 = r4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = r4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r0=reg128#15 +# asm 2: vpor r0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r4 = v01 | v11 +# asm 1: vpor r4=reg128#7 +# asm 2: vpor r4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = r1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = r5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = r5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r1=reg128#11 +# asm 2: vpor r1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#8 +# asm 2: vpor r5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = r2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = r6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = r6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#9 +# asm 2: vpor r6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = r3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = r7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = r7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r3=reg128#13 +# asm 2: vpor r3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#10 +# asm 2: vpor r7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = r0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = r2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = r2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r0=reg128#14 +# asm 2: vpor r0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: r2 = v01 | v11 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = r1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = r3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand 
%xmm3,%xmm10,%xmm10 + +# qhasm: v11 = r3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r1=reg128#15 +# asm 2: vpor r1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = r4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = r6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = r6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r4=reg128#13 +# asm 2: vpor r4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#7 +# asm 2: vpor r6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = r5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = r7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = r7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#8 +# asm 2: vpor r7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = r0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = r1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = r1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r0=reg128#10 +# asm 2: vpor r0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: r1 = v01 | v11 +# asm 1: vpor r1=reg128#14 +# asm 2: vpor r1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = r2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = r3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = r3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r2=reg128#15 +# asm 2: vpor r2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = r4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = r5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = r5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r4=reg128#12 +# asm 2: vpor r4=%xmm11 
+vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = r6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = r7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = r7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r6=reg128#13 +# asm 2: vpor r6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#7 +# asm 2: vpor r7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: t0 = r0[0]r1[0] +# asm 1: vpunpcklqdq t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm13,%xmm9,%xmm7 + +# qhasm: mem128[ input_0 + 128 ] = t0 +# asm 1: movdqu t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm10,%xmm14,%xmm7 + +# qhasm: mem128[ input_0 + 144 ] = t0 +# asm 1: movdqu t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm8,%xmm11,%xmm7 + +# qhasm: mem128[ input_0 + 160 ] = t0 +# asm 1: movdqu t0=reg128#7 +# asm 2: vpunpcklqdq t0=%xmm6 +vpunpcklqdq %xmm6,%xmm12,%xmm6 + +# qhasm: mem128[ input_0 + 176 ] = t0 +# asm 1: movdqu r0=reg128#7 +# asm 2: movddup 192(r0=%xmm6 +movddup 192(%rdi),%xmm6 + +# qhasm: r1 = mem64[ input_0 + 200 ] x2 +# asm 1: movddup 200(r1=reg128#8 +# asm 2: movddup 200(r1=%xmm7 +movddup 200(%rdi),%xmm7 + +# qhasm: r2 = mem64[ input_0 + 208 ] x2 +# asm 1: movddup 208(r2=reg128#9 +# asm 2: movddup 208(r2=%xmm8 +movddup 208(%rdi),%xmm8 + +# qhasm: r3 = mem64[ input_0 + 216 ] x2 +# asm 1: movddup 216(r3=reg128#10 +# asm 2: movddup 216(r3=%xmm9 +movddup 216(%rdi),%xmm9 + +# qhasm: r4 = mem64[ input_0 + 224 ] x2 +# asm 1: movddup 224(r4=reg128#11 +# asm 2: movddup 224(r4=%xmm10 +movddup 224(%rdi),%xmm10 + +# qhasm: r5 = mem64[ input_0 + 232 ] x2 +# asm 1: movddup 232(r5=reg128#12 +# asm 2: movddup 232(r5=%xmm11 +movddup 232(%rdi),%xmm11 + +# qhasm: r6 = mem64[ input_0 + 240 ] x2 +# asm 1: movddup 240(r6=reg128#13 +# asm 2: movddup 240(r6=%xmm12 +movddup 240(%rdi),%xmm12 + +# qhasm: r7 = mem64[ input_0 + 248 ] x2 +# asm 1: movddup 248(r7=reg128#14 +# asm 2: movddup 248(r7=%xmm13 +movddup 248(%rdi),%xmm13 + +# qhasm: v00 = r0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = r4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = r4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r0=reg128#15 +# asm 2: vpor r0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r4 = v01 | v11 +# asm 1: vpor r4=reg128#7 +# asm 2: vpor r4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = r1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = r5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = r5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq 
$4,r1=reg128#11 +# asm 2: vpor r1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#8 +# asm 2: vpor r5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = r2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = r6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = r6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#9 +# asm 2: vpor r6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = r3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = r7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = r7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r3=reg128#13 +# asm 2: vpor r3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#10 +# asm 2: vpor r7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = r0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = r2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = r2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r0=reg128#14 +# asm 2: vpor r0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: r2 = v01 | v11 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = r1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = r3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = r3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r1=reg128#15 +# asm 2: vpor r1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = r4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = r6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = r6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r4=reg128#13 +# asm 2: vpor r4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#7 +# asm 2: vpor r6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = r5 & mask2 +# asm 1: 
vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = r7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = r7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#8 +# asm 2: vpor r7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = r0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = r1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = r1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r0=reg128#10 +# asm 2: vpor r0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: r1 = v01 | v11 +# asm 1: vpor r1=reg128#14 +# asm 2: vpor r1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = r2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = r3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = r3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r2=reg128#15 +# asm 2: vpor r2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = r4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = r5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = r5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r4=reg128#12 +# asm 2: vpor r4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = r6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = r7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = r7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r6=reg128#13 +# asm 2: vpor r6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#7 +# asm 2: vpor r7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: t0 = r0[0]r1[0] +# asm 1: vpunpcklqdq t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm13,%xmm9,%xmm7 + +# qhasm: mem128[ input_0 + 192 ] = t0 +# asm 1: movdqu t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm10,%xmm14,%xmm7 + +# 
qhasm: mem128[ input_0 + 208 ] = t0 +# asm 1: movdqu t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm8,%xmm11,%xmm7 + +# qhasm: mem128[ input_0 + 224 ] = t0 +# asm 1: movdqu t0=reg128#7 +# asm 2: vpunpcklqdq t0=%xmm6 +vpunpcklqdq %xmm6,%xmm12,%xmm6 + +# qhasm: mem128[ input_0 + 240 ] = t0 +# asm 1: movdqu r0=reg128#7 +# asm 2: movddup 256(r0=%xmm6 +movddup 256(%rdi),%xmm6 + +# qhasm: r1 = mem64[ input_0 + 264 ] x2 +# asm 1: movddup 264(r1=reg128#8 +# asm 2: movddup 264(r1=%xmm7 +movddup 264(%rdi),%xmm7 + +# qhasm: r2 = mem64[ input_0 + 272 ] x2 +# asm 1: movddup 272(r2=reg128#9 +# asm 2: movddup 272(r2=%xmm8 +movddup 272(%rdi),%xmm8 + +# qhasm: r3 = mem64[ input_0 + 280 ] x2 +# asm 1: movddup 280(r3=reg128#10 +# asm 2: movddup 280(r3=%xmm9 +movddup 280(%rdi),%xmm9 + +# qhasm: r4 = mem64[ input_0 + 288 ] x2 +# asm 1: movddup 288(r4=reg128#11 +# asm 2: movddup 288(r4=%xmm10 +movddup 288(%rdi),%xmm10 + +# qhasm: r5 = mem64[ input_0 + 296 ] x2 +# asm 1: movddup 296(r5=reg128#12 +# asm 2: movddup 296(r5=%xmm11 +movddup 296(%rdi),%xmm11 + +# qhasm: r6 = mem64[ input_0 + 304 ] x2 +# asm 1: movddup 304(r6=reg128#13 +# asm 2: movddup 304(r6=%xmm12 +movddup 304(%rdi),%xmm12 + +# qhasm: r7 = mem64[ input_0 + 312 ] x2 +# asm 1: movddup 312(r7=reg128#14 +# asm 2: movddup 312(r7=%xmm13 +movddup 312(%rdi),%xmm13 + +# qhasm: v00 = r0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = r4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = r4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r0=reg128#15 +# asm 2: vpor r0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r4 = v01 | v11 +# asm 1: vpor r4=reg128#7 +# asm 2: vpor r4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = r1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = r5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = r5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r1=reg128#11 +# asm 2: vpor r1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#8 +# asm 2: vpor r5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = r2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = r6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = r6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#9 +# asm 2: vpor r6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = r3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = r7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 
+ +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = r7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r3=reg128#13 +# asm 2: vpor r3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#10 +# asm 2: vpor r7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = r0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = r2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = r2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r0=reg128#14 +# asm 2: vpor r0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: r2 = v01 | v11 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = r1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = r3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = r3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r1=reg128#15 +# asm 2: vpor r1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = r4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = r6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = r6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r4=reg128#13 +# asm 2: vpor r4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#7 +# asm 2: vpor r6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = r5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = r7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = r7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#8 +# asm 2: vpor r7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = r0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = r1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = r1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand 
%xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r0=reg128#10 +# asm 2: vpor r0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: r1 = v01 | v11 +# asm 1: vpor r1=reg128#14 +# asm 2: vpor r1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = r2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = r3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = r3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r2=reg128#15 +# asm 2: vpor r2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = r4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = r5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = r5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r4=reg128#12 +# asm 2: vpor r4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = r6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = r7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = r7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r6=reg128#13 +# asm 2: vpor r6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#7 +# asm 2: vpor r7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: t0 = r0[0]r1[0] +# asm 1: vpunpcklqdq t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm13,%xmm9,%xmm7 + +# qhasm: mem128[ input_0 + 256 ] = t0 +# asm 1: movdqu t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm10,%xmm14,%xmm7 + +# qhasm: mem128[ input_0 + 272 ] = t0 +# asm 1: movdqu t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm8,%xmm11,%xmm7 + +# qhasm: mem128[ input_0 + 288 ] = t0 +# asm 1: movdqu t0=reg128#7 +# asm 2: vpunpcklqdq t0=%xmm6 +vpunpcklqdq %xmm6,%xmm12,%xmm6 + +# qhasm: mem128[ input_0 + 304 ] = t0 +# asm 1: movdqu r0=reg128#7 +# asm 2: movddup 320(r0=%xmm6 +movddup 320(%rdi),%xmm6 + +# qhasm: r1 = mem64[ input_0 + 328 ] x2 +# asm 1: movddup 328(r1=reg128#8 +# asm 2: movddup 328(r1=%xmm7 +movddup 328(%rdi),%xmm7 + +# qhasm: r2 = mem64[ input_0 + 336 ] x2 +# asm 1: movddup 336(r2=reg128#9 +# asm 2: movddup 336(r2=%xmm8 +movddup 336(%rdi),%xmm8 + +# qhasm: r3 = mem64[ input_0 + 344 ] x2 +# asm 1: movddup 344(r3=reg128#10 +# asm 2: movddup 344(r3=%xmm9 +movddup 344(%rdi),%xmm9 + +# qhasm: r4 = mem64[ input_0 + 352 ] x2 +# asm 1: movddup 352(r4=reg128#11 +# asm 2: movddup 352(r4=%xmm10 +movddup 352(%rdi),%xmm10 + +# qhasm: r5 = mem64[ input_0 + 360 ] x2 +# asm 1: movddup 360(r5=reg128#12 +# asm 2: movddup 360(r5=%xmm11 +movddup 
360(%rdi),%xmm11 + +# qhasm: r6 = mem64[ input_0 + 368 ] x2 +# asm 1: movddup 368(r6=reg128#13 +# asm 2: movddup 368(r6=%xmm12 +movddup 368(%rdi),%xmm12 + +# qhasm: r7 = mem64[ input_0 + 376 ] x2 +# asm 1: movddup 376(r7=reg128#14 +# asm 2: movddup 376(r7=%xmm13 +movddup 376(%rdi),%xmm13 + +# qhasm: v00 = r0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = r4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = r4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r0=reg128#15 +# asm 2: vpor r0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r4 = v01 | v11 +# asm 1: vpor r4=reg128#7 +# asm 2: vpor r4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = r1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = r5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = r5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r1=reg128#11 +# asm 2: vpor r1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#8 +# asm 2: vpor r5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = r2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = r6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = r6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#9 +# asm 2: vpor r6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = r3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = r7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = r7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r3=reg128#13 +# asm 2: vpor r3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#10 +# asm 2: vpor r7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = r0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = r2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = r2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r0=reg128#14 +# asm 2: vpor r0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: r2 = v01 | v11 +# 
asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = r1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = r3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = r3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r1=reg128#15 +# asm 2: vpor r1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = r4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = r6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = r6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r4=reg128#13 +# asm 2: vpor r4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#7 +# asm 2: vpor r6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = r5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = r7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = r7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#8 +# asm 2: vpor r7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = r0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = r1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = r1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r0=reg128#10 +# asm 2: vpor r0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: r1 = v01 | v11 +# asm 1: vpor r1=reg128#14 +# asm 2: vpor r1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = r2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = r3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = r3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r2=reg128#15 +# asm 2: vpor r2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = r4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = r5 & mask4 
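The exchanges are applied in three rounds to eight 64-bit lanes r0..r7: first between lanes 4 apart with shift 4, then 2 apart with shift 2, then between adjacent lanes with shift 1. The unrolled vpand/psllq/psrlq/vpor blocks above and below are that loop with registers renamed by qhasm; a sketch using the exchange_step helper from the previous note (masks[k][0]/masks[k][1] stand for the mask pairs loaded earlier in the file, values not repeated here):

    /* Three exchange rounds over r[0..7]; pair distance and shift are
     * both 4, 2, 1.  Loop form of the unrolled sequence in this file. */
    static void exchange_rounds(uint64_t r[8], const uint64_t masks[3][2])
    {
        static const int step[3] = {4, 2, 1};
        for (int k = 0; k < 3; k++) {
            int d = step[k];
            for (int i = 0; i < 8; i++) {
                if ((i & d) == 0) {   /* visit each pair exactly once */
                    exchange_step(&r[i], &r[i + d],
                                  masks[k][0], masks[k][1], d);
                }
            }
        }
    }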
+# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = r5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r4=reg128#12 +# asm 2: vpor r4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = r6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = r7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = r7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r6=reg128#13 +# asm 2: vpor r6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#7 +# asm 2: vpor r7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: t0 = r0[0]r1[0] +# asm 1: vpunpcklqdq t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm13,%xmm9,%xmm7 + +# qhasm: mem128[ input_0 + 320 ] = t0 +# asm 1: movdqu t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm10,%xmm14,%xmm7 + +# qhasm: mem128[ input_0 + 336 ] = t0 +# asm 1: movdqu t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm8,%xmm11,%xmm7 + +# qhasm: mem128[ input_0 + 352 ] = t0 +# asm 1: movdqu t0=reg128#7 +# asm 2: vpunpcklqdq t0=%xmm6 +vpunpcklqdq %xmm6,%xmm12,%xmm6 + +# qhasm: mem128[ input_0 + 368 ] = t0 +# asm 1: movdqu r0=reg128#7 +# asm 2: movddup 384(r0=%xmm6 +movddup 384(%rdi),%xmm6 + +# qhasm: r1 = mem64[ input_0 + 392 ] x2 +# asm 1: movddup 392(r1=reg128#8 +# asm 2: movddup 392(r1=%xmm7 +movddup 392(%rdi),%xmm7 + +# qhasm: r2 = mem64[ input_0 + 400 ] x2 +# asm 1: movddup 400(r2=reg128#9 +# asm 2: movddup 400(r2=%xmm8 +movddup 400(%rdi),%xmm8 + +# qhasm: r3 = mem64[ input_0 + 408 ] x2 +# asm 1: movddup 408(r3=reg128#10 +# asm 2: movddup 408(r3=%xmm9 +movddup 408(%rdi),%xmm9 + +# qhasm: r4 = mem64[ input_0 + 416 ] x2 +# asm 1: movddup 416(r4=reg128#11 +# asm 2: movddup 416(r4=%xmm10 +movddup 416(%rdi),%xmm10 + +# qhasm: r5 = mem64[ input_0 + 424 ] x2 +# asm 1: movddup 424(r5=reg128#12 +# asm 2: movddup 424(r5=%xmm11 +movddup 424(%rdi),%xmm11 + +# qhasm: r6 = mem64[ input_0 + 432 ] x2 +# asm 1: movddup 432(r6=reg128#13 +# asm 2: movddup 432(r6=%xmm12 +movddup 432(%rdi),%xmm12 + +# qhasm: r7 = mem64[ input_0 + 440 ] x2 +# asm 1: movddup 440(r7=reg128#14 +# asm 2: movddup 440(r7=%xmm13 +movddup 440(%rdi),%xmm13 + +# qhasm: v00 = r0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = r4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = r4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r0=reg128#15 +# asm 2: vpor r0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r4 = v01 | v11 +# asm 1: vpor r4=reg128#7 +# asm 2: vpor r4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = r1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand 
%xmm0,%xmm7,%xmm10 + +# qhasm: v10 = r5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = r5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r1=reg128#11 +# asm 2: vpor r1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#8 +# asm 2: vpor r5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = r2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = r6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = r6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#9 +# asm 2: vpor r6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = r3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = r7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = r7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r3=reg128#13 +# asm 2: vpor r3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#10 +# asm 2: vpor r7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = r0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = r2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = r2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r0=reg128#14 +# asm 2: vpor r0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: r2 = v01 | v11 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = r1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = r3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = r3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r1=reg128#15 +# asm 2: vpor r1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = r4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = r6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand 
v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = r6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r4=reg128#13 +# asm 2: vpor r4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#7 +# asm 2: vpor r6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = r5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = r7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = r7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#8 +# asm 2: vpor r7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = r0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = r1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = r1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r0=reg128#10 +# asm 2: vpor r0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: r1 = v01 | v11 +# asm 1: vpor r1=reg128#14 +# asm 2: vpor r1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = r2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = r3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = r3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r2=reg128#15 +# asm 2: vpor r2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = r4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = r5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = r5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r4=reg128#12 +# asm 2: vpor r4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = r6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = r7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = r7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r6=reg128#13 +# asm 2: vpor 
r6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#7 +# asm 2: vpor r7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: t0 = r0[0]r1[0] +# asm 1: vpunpcklqdq t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm13,%xmm9,%xmm7 + +# qhasm: mem128[ input_0 + 384 ] = t0 +# asm 1: movdqu t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm10,%xmm14,%xmm7 + +# qhasm: mem128[ input_0 + 400 ] = t0 +# asm 1: movdqu t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm8,%xmm11,%xmm7 + +# qhasm: mem128[ input_0 + 416 ] = t0 +# asm 1: movdqu t0=reg128#7 +# asm 2: vpunpcklqdq t0=%xmm6 +vpunpcklqdq %xmm6,%xmm12,%xmm6 + +# qhasm: mem128[ input_0 + 432 ] = t0 +# asm 1: movdqu r0=reg128#7 +# asm 2: movddup 448(r0=%xmm6 +movddup 448(%rdi),%xmm6 + +# qhasm: r1 = mem64[ input_0 + 456 ] x2 +# asm 1: movddup 456(r1=reg128#8 +# asm 2: movddup 456(r1=%xmm7 +movddup 456(%rdi),%xmm7 + +# qhasm: r2 = mem64[ input_0 + 464 ] x2 +# asm 1: movddup 464(r2=reg128#9 +# asm 2: movddup 464(r2=%xmm8 +movddup 464(%rdi),%xmm8 + +# qhasm: r3 = mem64[ input_0 + 472 ] x2 +# asm 1: movddup 472(r3=reg128#10 +# asm 2: movddup 472(r3=%xmm9 +movddup 472(%rdi),%xmm9 + +# qhasm: r4 = mem64[ input_0 + 480 ] x2 +# asm 1: movddup 480(r4=reg128#11 +# asm 2: movddup 480(r4=%xmm10 +movddup 480(%rdi),%xmm10 + +# qhasm: r5 = mem64[ input_0 + 488 ] x2 +# asm 1: movddup 488(r5=reg128#12 +# asm 2: movddup 488(r5=%xmm11 +movddup 488(%rdi),%xmm11 + +# qhasm: r6 = mem64[ input_0 + 496 ] x2 +# asm 1: movddup 496(r6=reg128#13 +# asm 2: movddup 496(r6=%xmm12 +movddup 496(%rdi),%xmm12 + +# qhasm: r7 = mem64[ input_0 + 504 ] x2 +# asm 1: movddup 504(r7=reg128#14 +# asm 2: movddup 504(r7=%xmm13 +movddup 504(%rdi),%xmm13 + +# qhasm: v00 = r0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = r4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = r4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r0=reg128#15 +# asm 2: vpor r0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r4 = v01 | v11 +# asm 1: vpor r4=reg128#7 +# asm 2: vpor r4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = r1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = r5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = r5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r1=reg128#11 +# asm 2: vpor r1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#8 +# asm 2: vpor r5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = r2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = r6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = r6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: 
psrlq $4,r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#9 +# asm 2: vpor r6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = r3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = r7 & mask0 +# asm 1: vpand v10=reg128#1 +# asm 2: vpand v10=%xmm0 +vpand %xmm0,%xmm13,%xmm0 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = r7 & mask1 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm1,%xmm13,%xmm1 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r3=reg128#1 +# asm 2: vpor r3=%xmm0 +vpor %xmm0,%xmm12,%xmm0 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#2 +# asm 2: vpor r7=%xmm1 +vpor %xmm1,%xmm9,%xmm1 + +# qhasm: v00 = r0 & mask2 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm2,%xmm14,%xmm9 + +# qhasm: v10 = r2 & mask2 +# asm 1: vpand v10=reg128#13 +# asm 2: vpand v10=%xmm12 +vpand %xmm2,%xmm11,%xmm12 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm3,%xmm14,%xmm13 + +# qhasm: v11 = r2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r0=reg128#10 +# asm 2: vpor r0=%xmm9 +vpor %xmm12,%xmm9,%xmm9 + +# qhasm: r2 = v01 | v11 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm11,%xmm13,%xmm11 + +# qhasm: v00 = r1 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm10,%xmm12 + +# qhasm: v10 = r3 & mask2 +# asm 1: vpand v10=reg128#14 +# asm 2: vpand v10=%xmm13 +vpand %xmm2,%xmm0,%xmm13 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = r3 & mask3 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm3,%xmm0,%xmm0 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r1=reg128#13 +# asm 2: vpor r1=%xmm12 +vpor %xmm13,%xmm12,%xmm12 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#1 +# asm 2: vpor r3=%xmm0 +vpor %xmm0,%xmm10,%xmm0 + +# qhasm: v00 = r4 & mask2 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm2,%xmm6,%xmm10 + +# qhasm: v10 = r6 & mask2 +# asm 1: vpand v10=reg128#14 +# asm 2: vpand v10=%xmm13 +vpand %xmm2,%xmm8,%xmm13 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = r6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r4=reg128#11 +# asm 2: vpor r4=%xmm10 +vpor %xmm13,%xmm10,%xmm10 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#7 +# asm 2: vpor r6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = r5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = r7 & mask2 +# asm 1: vpand v10=reg128#3 +# asm 2: vpand v10=%xmm2 +vpand %xmm2,%xmm1,%xmm2 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = r7 & mask3 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm3,%xmm1,%xmm1 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r5=reg128#3 +# asm 2: vpor r5=%xmm2 +vpor %xmm2,%xmm8,%xmm2 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#2 +# asm 2: vpor r7=%xmm1 +vpor %xmm1,%xmm7,%xmm1 + +# qhasm: v00 = r0 & mask4 +# asm 1: vpand v00=reg128#4 +# asm 2: vpand 
v00=%xmm3 +vpand %xmm4,%xmm9,%xmm3 + +# qhasm: v10 = r1 & mask4 +# asm 1: vpand v10=reg128#8 +# asm 2: vpand v10=%xmm7 +vpand %xmm4,%xmm12,%xmm7 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm5,%xmm9,%xmm8 + +# qhasm: v11 = r1 & mask5 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm5,%xmm12,%xmm9 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r0=reg128#4 +# asm 2: vpor r0=%xmm3 +vpor %xmm7,%xmm3,%xmm3 + +# qhasm: r1 = v01 | v11 +# asm 1: vpor r1=reg128#8 +# asm 2: vpor r1=%xmm7 +vpor %xmm9,%xmm8,%xmm7 + +# qhasm: v00 = r2 & mask4 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm4,%xmm11,%xmm8 + +# qhasm: v10 = r3 & mask4 +# asm 1: vpand v10=reg128#10 +# asm 2: vpand v10=%xmm9 +vpand %xmm4,%xmm0,%xmm9 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = r3 & mask5 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm5,%xmm0,%xmm0 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r2=reg128#9 +# asm 2: vpor r2=%xmm8 +vpor %xmm9,%xmm8,%xmm8 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#1 +# asm 2: vpor r3=%xmm0 +vpor %xmm0,%xmm11,%xmm0 + +# qhasm: v00 = r4 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm10,%xmm9 + +# qhasm: v10 = r5 & mask4 +# asm 1: vpand v10=reg128#12 +# asm 2: vpand v10=%xmm11 +vpand %xmm4,%xmm2,%xmm11 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: v11 = r5 & mask5 +# asm 1: vpand v11=reg128#3 +# asm 2: vpand v11=%xmm2 +vpand %xmm5,%xmm2,%xmm2 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r4=reg128#10 +# asm 2: vpor r4=%xmm9 +vpor %xmm11,%xmm9,%xmm9 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#3 +# asm 2: vpor r5=%xmm2 +vpor %xmm2,%xmm10,%xmm2 + +# qhasm: v00 = r6 & mask4 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm4,%xmm6,%xmm10 + +# qhasm: v10 = r7 & mask4 +# asm 1: vpand v10=reg128#5 +# asm 2: vpand v10=%xmm4 +vpand %xmm4,%xmm1,%xmm4 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = r7 & mask5 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm5,%xmm1,%xmm1 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r6=reg128#5 +# asm 2: vpor r6=%xmm4 +vpor %xmm4,%xmm10,%xmm4 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#2 +# asm 2: vpor r7=%xmm1 +vpor %xmm1,%xmm6,%xmm1 + +# qhasm: t0 = r0[0]r1[0] +# asm 1: vpunpcklqdq t0=reg128#4 +# asm 2: vpunpcklqdq t0=%xmm3 +vpunpcklqdq %xmm7,%xmm3,%xmm3 + +# qhasm: mem128[ input_0 + 448 ] = t0 +# asm 1: movdqu t0=reg128#1 +# asm 2: vpunpcklqdq t0=%xmm0 +vpunpcklqdq %xmm0,%xmm8,%xmm0 + +# qhasm: mem128[ input_0 + 464 ] = t0 +# asm 1: movdqu t0=reg128#1 +# asm 2: vpunpcklqdq t0=%xmm0 +vpunpcklqdq %xmm2,%xmm9,%xmm0 + +# qhasm: mem128[ input_0 + 480 ] = t0 +# asm 1: movdqu t0=reg128#1 +# asm 2: vpunpcklqdq t0=%xmm0 +vpunpcklqdq %xmm1,%xmm4,%xmm0 + +# qhasm: mem128[ input_0 + 496 ] = t0 +# asm 1: movdqu s1=int64#2 +# asm 2: mov s1=%rsi +mov %rsi,%rsi + +# qhasm: s0 = mem64[ input_0 + 0 ] +# asm 1: movq 0(s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s0 = (s1 
s0) >> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,> (j * 8)) & 0xFF; + } +} + +void PQCLEAN_MCELIECE348864_SSE_store2(unsigned char *dest, gf a) { + dest[0] = a & 0xFF; + dest[1] = a >> 8; +} + +uint16_t PQCLEAN_MCELIECE348864_SSE_load2(const unsigned char *src) { + uint16_t a; + + a = src[1]; + a <<= 8; + a |= src[0]; + + return a & GFMASK; +} + +uint32_t PQCLEAN_MCELIECE348864_SSE_load4(const unsigned char *src) { + uint32_t a; + + a = src[3]; + a <<= 8; + a |= src[2]; + a <<= 8; + a |= src[1]; + a <<= 8; + a |= src[0]; + + return a; +} + +void PQCLEAN_MCELIECE348864_SSE_irr_load(uint64_t *out, const unsigned char *in) { + int i, j; + uint16_t irr[ SYS_T + 1 ]; + + for (i = 0; i < SYS_T; i++) { + irr[i] = PQCLEAN_MCELIECE348864_SSE_load2(in + i * 2); + irr[i] &= GFMASK; + } + + irr[ SYS_T ] = 1; + + for (i = 0; i < GFBITS; i++) { + out[i] = 0; + } + + for (i = SYS_T; i >= 0; i--) { + for (j = 0; j < GFBITS; j++) { + out[j] <<= 1; + out[j] |= (irr[i] >> j) & 1; + } + } +} + +void PQCLEAN_MCELIECE348864_SSE_store8(unsigned char *out, uint64_t in) { + out[0] = (in >> 0x00) & 0xFF; + out[1] = (in >> 0x08) & 0xFF; + out[2] = (in >> 0x10) & 0xFF; + out[3] = (in >> 0x18) & 0xFF; + out[4] = (in >> 0x20) & 0xFF; + out[5] = (in >> 0x28) & 0xFF; + out[6] = (in >> 0x30) & 0xFF; + out[7] = (in >> 0x38) & 0xFF; +} + +uint64_t PQCLEAN_MCELIECE348864_SSE_load8(const unsigned char *in) { + int i; + uint64_t ret = in[7]; + + for (i = 6; i >= 0; i--) { + ret <<= 8; + ret |= in[i]; + } + + return ret; +} + +gf PQCLEAN_MCELIECE348864_SSE_bitrev(gf a) { + a = ((a & 0x00FF) << 8) | ((a & 0xFF00) >> 8); + a = ((a & 0x0F0F) << 4) | ((a & 0xF0F0) >> 4); + a = ((a & 0x3333) << 2) | ((a & 0xCCCC) >> 2); + a = ((a & 0x5555) << 1) | ((a & 0xAAAA) >> 1); + + return a >> 4; +} + +vec128 PQCLEAN_MCELIECE348864_SSE_load16(const unsigned char *in) { + return PQCLEAN_MCELIECE348864_SSE_vec128_set2x( PQCLEAN_MCELIECE348864_SSE_load8(in), PQCLEAN_MCELIECE348864_SSE_load8(in + 8) ); +} + +void PQCLEAN_MCELIECE348864_SSE_store16(unsigned char *out, vec128 in) { + PQCLEAN_MCELIECE348864_SSE_store8(out + 0, PQCLEAN_MCELIECE348864_SSE_vec128_extract(in, 0)); + PQCLEAN_MCELIECE348864_SSE_store8(out + 8, PQCLEAN_MCELIECE348864_SSE_vec128_extract(in, 1)); +} diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_sse/util.h b/src/kem/classic_mceliece/pqclean_mceliece348864_sse/util.h new file mode 100644 index 
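util.c above is mostly plain little-endian (de)serialisation (load2/store2, load4, load8/store8, load16/store16); the one less obvious helper is irr_load, which transposes an array of GFBITS-bit coefficients into GFBITS 64-bit words, one bit plane per word: the bitsliced form the rest of this implementation works on. A generic sketch of that transposition (hypothetical name bitslice; same inner loop shape as irr_load, with n limited to 64 so every coefficient keeps its slot):

    #include <stdint.h>

    /* Bit-slice n m-bit coefficients (n <= 64, m <= 16): afterwards
     * out[j] holds bit j of every coefficient, with coefficient i
     * sitting in bit i of out[j]. */
    static void bitslice(uint64_t *out, const uint16_t *in, int n, int m)
    {
        for (int j = 0; j < m; j++) {
            out[j] = 0;
        }
        for (int i = n - 1; i >= 0; i--) {
            for (int j = 0; j < m; j++) {
                out[j] <<= 1;
                out[j] |= (in[i] >> j) & 1;
            }
        }
    }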
0000000000..97491b39ae --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_sse/util.h @@ -0,0 +1,33 @@ +#ifndef PQCLEAN_MCELIECE348864_SSE_UTIL_H +#define PQCLEAN_MCELIECE348864_SSE_UTIL_H +/* + This file is for loading/storing data in a little-endian fashion +*/ + + +#include "gf.h" +#include "vec128.h" + +#include + +void PQCLEAN_MCELIECE348864_SSE_store_i(unsigned char *out, uint64_t in, int i); +void PQCLEAN_MCELIECE348864_SSE_store2(unsigned char *dest, gf a); + +uint16_t PQCLEAN_MCELIECE348864_SSE_load2(const unsigned char *src); + +uint32_t PQCLEAN_MCELIECE348864_SSE_load4(const unsigned char *src); + +void PQCLEAN_MCELIECE348864_SSE_irr_load(uint64_t *out, const unsigned char *in); + +void PQCLEAN_MCELIECE348864_SSE_store8(unsigned char *out, uint64_t in); + +uint64_t PQCLEAN_MCELIECE348864_SSE_load8(const unsigned char *in); + +gf PQCLEAN_MCELIECE348864_SSE_bitrev(gf a); + +vec128 PQCLEAN_MCELIECE348864_SSE_load16(const unsigned char *in); + +void PQCLEAN_MCELIECE348864_SSE_store16(unsigned char *out, vec128 in); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_sse/vec.c b/src/kem/classic_mceliece/pqclean_mceliece348864_sse/vec.c new file mode 100644 index 0000000000..82e40b26fa --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_sse/vec.c @@ -0,0 +1,17 @@ + +#include "vec.h" + +#include "params.h" + +void PQCLEAN_MCELIECE348864_SSE_vec_mul(uint64_t *h, const uint64_t *f, const uint64_t *g) { + PQCLEAN_MCELIECE348864_SSE_vec_mul_asm(h, f, g, 8); +} + +void PQCLEAN_MCELIECE348864_SSE_vec_add(uint64_t *h, const uint64_t *f, const uint64_t *g) { + int b; + + for (b = 0; b < GFBITS; b++) { + h[b] = f[b] ^ g[b]; + } +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_sse/vec.h b/src/kem/classic_mceliece/pqclean_mceliece348864_sse/vec.h new file mode 100644 index 0000000000..d33258e0d9 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_sse/vec.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_MCELIECE348864_SSE_VEC_H +#define PQCLEAN_MCELIECE348864_SSE_VEC_H + +#include + +extern void PQCLEAN_MCELIECE348864_SSE_vec_mul_asm(uint64_t *, const uint64_t *, const uint64_t *, int); + +void PQCLEAN_MCELIECE348864_SSE_vec_mul(uint64_t *h, const uint64_t *f, const uint64_t *g); +void PQCLEAN_MCELIECE348864_SSE_vec_add(uint64_t *h, const uint64_t *f, const uint64_t *g); + +#endif diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_sse/vec128.c b/src/kem/classic_mceliece/pqclean_mceliece348864_sse/vec128.c new file mode 100644 index 0000000000..219cb19cca --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_sse/vec128.c @@ -0,0 +1,143 @@ +/* + This file is for functions related to 128-bit vectors + including functions for bitsliced field operations +*/ + +#include "vec128.h" + +#include "params.h" + +vec128 PQCLEAN_MCELIECE348864_SSE_vec128_set1_16b(uint16_t a) { + return _mm_set1_epi16(a); +} + +int PQCLEAN_MCELIECE348864_SSE_vec128_testz(vec128 a) { + return _mm_testz_si128(a, a); +} + +vec128 PQCLEAN_MCELIECE348864_SSE_vec128_setzero(void) { + return _mm_setzero_si128(); +} + +vec128 PQCLEAN_MCELIECE348864_SSE_vec128_and(vec128 a, vec128 b) { + return _mm_and_si128(a, b); +} + +vec128 PQCLEAN_MCELIECE348864_SSE_vec128_xor(vec128 a, vec128 b) { + return _mm_xor_si128(a, b); +} + +vec128 PQCLEAN_MCELIECE348864_SSE_vec128_or(vec128 a, vec128 b) { + return _mm_or_si128(a, b); +} + +vec128 PQCLEAN_MCELIECE348864_SSE_vec128_sll_2x(vec128 a, int s) { + return _mm_slli_epi64(a, s); +} + +vec128 
PQCLEAN_MCELIECE348864_SSE_vec128_srl_2x(vec128 a, int s) { + return _mm_srli_epi64(a, s); +} + +vec128 PQCLEAN_MCELIECE348864_SSE_vec128_set2x(uint64_t a0, uint64_t a1) { + return _mm_set_epi64x(a1, a0); +} + +vec128 PQCLEAN_MCELIECE348864_SSE_vec128_unpack_low(vec128 a, vec128 b) { + return _mm_unpacklo_epi64(a, b); +} + +vec128 PQCLEAN_MCELIECE348864_SSE_vec128_unpack_high(vec128 a, vec128 b) { + return _mm_unpackhi_epi64(a, b); +} + +vec128 PQCLEAN_MCELIECE348864_SSE_vec128_setbits(uint64_t a) { + return _mm_set1_epi64x(-a); +} + +void PQCLEAN_MCELIECE348864_SSE_vec128_copy(vec128 *dest, const vec128 *src) { + int i; + + for (i = 0; i < GFBITS; i++) { + dest[i] = src[i]; + } +} + +void PQCLEAN_MCELIECE348864_SSE_vec128_add(vec128 *c, const vec128 *a, const vec128 *b) { + int i; + + for (i = 0; i < GFBITS; i++) { + c[i] = PQCLEAN_MCELIECE348864_SSE_vec128_xor(a[i], b[i]); + } +} + +vec128 PQCLEAN_MCELIECE348864_SSE_vec128_or_reduce(const vec128 *a) { + int i; + vec128 ret; + + ret = a[0]; + for (i = 1; i < GFBITS; i++) { + ret = PQCLEAN_MCELIECE348864_SSE_vec128_or(ret, a[i]); + } + + return ret; +} + +/* bitsliced field multiplications */ +void PQCLEAN_MCELIECE348864_SSE_vec128_mul(vec128 *h, vec128 *f, const vec128 *g) { + PQCLEAN_MCELIECE348864_SSE_vec128_mul_asm(h, f, g, 16); +} + +/* bitsliced field squarings */ +void PQCLEAN_MCELIECE348864_SSE_vec128_sq(vec128 *out, const vec128 *in) { + int i; + vec128 result[GFBITS]; + + result[0] = in[0] ^ in[6]; + result[1] = in[11]; + result[2] = in[1] ^ in[7]; + result[3] = in[6]; + result[4] = in[2] ^ in[11] ^ in[8]; + result[5] = in[7]; + result[6] = in[3] ^ in[9]; + result[7] = in[8]; + result[8] = in[4] ^ in[10]; + result[9] = in[9]; + result[10] = in[5] ^ in[11]; + result[11] = in[10]; + + for (i = 0; i < GFBITS; i++) { + out[i] = result[i]; + } +} + +/* bitsliced field inverses */ +void PQCLEAN_MCELIECE348864_SSE_vec128_inv(vec128 *out, const vec128 *in) { + vec128 tmp_11[ GFBITS ]; + vec128 tmp_1111[ GFBITS ]; + + PQCLEAN_MCELIECE348864_SSE_vec128_copy(out, in); + + PQCLEAN_MCELIECE348864_SSE_vec128_sq(out, out); + PQCLEAN_MCELIECE348864_SSE_vec128_mul(tmp_11, out, in); // 11 + + PQCLEAN_MCELIECE348864_SSE_vec128_sq(out, tmp_11); + PQCLEAN_MCELIECE348864_SSE_vec128_sq(out, out); + PQCLEAN_MCELIECE348864_SSE_vec128_mul(tmp_1111, out, tmp_11); // 1111 + + PQCLEAN_MCELIECE348864_SSE_vec128_sq(out, tmp_1111); + PQCLEAN_MCELIECE348864_SSE_vec128_sq(out, out); + PQCLEAN_MCELIECE348864_SSE_vec128_sq(out, out); + PQCLEAN_MCELIECE348864_SSE_vec128_sq(out, out); + PQCLEAN_MCELIECE348864_SSE_vec128_mul(out, out, tmp_1111); // 11111111 + + PQCLEAN_MCELIECE348864_SSE_vec128_sq(out, out); + PQCLEAN_MCELIECE348864_SSE_vec128_sq(out, out); + PQCLEAN_MCELIECE348864_SSE_vec128_mul(out, out, tmp_11); // 1111111111 + + PQCLEAN_MCELIECE348864_SSE_vec128_sq(out, out); + PQCLEAN_MCELIECE348864_SSE_vec128_mul(out, out, in); // 11111111111 + + PQCLEAN_MCELIECE348864_SSE_vec128_sq(out, out); // 111111111110 +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_sse/vec128.h b/src/kem/classic_mceliece/pqclean_mceliece348864_sse/vec128.h new file mode 100644 index 0000000000..e002e77f9e --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_sse/vec128.h @@ -0,0 +1,42 @@ +#ifndef PQCLEAN_MCELIECE348864_SSE_VEC128_H +#define PQCLEAN_MCELIECE348864_SSE_VEC128_H +/* + This file is for functions related to 128-bit vectors + including functions for bitsliced field operations +*/ + + +#include +#include + +typedef __m128i vec128; + +// this 
needs to be a macro, because +// _mm_extract_epi64 requires a literal int argument. +#define PQCLEAN_MCELIECE348864_SSE_vec128_extract(a, i) ((uint64_t) _mm_extract_epi64((vec128) (a), (i))) + +int PQCLEAN_MCELIECE348864_SSE_vec128_testz(vec128 a); +vec128 PQCLEAN_MCELIECE348864_SSE_vec128_set1_16b(uint16_t a); +vec128 PQCLEAN_MCELIECE348864_SSE_vec128_setzero(void); +vec128 PQCLEAN_MCELIECE348864_SSE_vec128_and(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE348864_SSE_vec128_xor(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE348864_SSE_vec128_or(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE348864_SSE_vec128_sll_2x(vec128 a, int s); +vec128 PQCLEAN_MCELIECE348864_SSE_vec128_srl_2x(vec128 a, int s); +vec128 PQCLEAN_MCELIECE348864_SSE_vec128_set2x(uint64_t a0, uint64_t a1); +vec128 PQCLEAN_MCELIECE348864_SSE_vec128_unpack_low(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE348864_SSE_vec128_unpack_high(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE348864_SSE_vec128_setbits(uint64_t a); +void PQCLEAN_MCELIECE348864_SSE_vec128_copy(vec128 *dest, const vec128 *src); +void PQCLEAN_MCELIECE348864_SSE_vec128_add(vec128 *c, const vec128 *a, const vec128 *b); +vec128 PQCLEAN_MCELIECE348864_SSE_vec128_or_reduce(const vec128 *a); + +extern void PQCLEAN_MCELIECE348864_SSE_vec128_mul_asm(vec128 *, vec128 *, const vec128 *, int); + +/* bitsliced field multiplications */ +void PQCLEAN_MCELIECE348864_SSE_vec128_mul(vec128 *h, vec128 *f, const vec128 *g); + +void PQCLEAN_MCELIECE348864_SSE_vec128_sq(vec128 * /*out*/, const vec128 * /*in*/); +void PQCLEAN_MCELIECE348864_SSE_vec128_inv(vec128 * /*out*/, const vec128 * /*in*/); + +#endif diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_sse/vec128_mul_asm.S b/src/kem/classic_mceliece/pqclean_mceliece348864_sse/vec128_mul_asm.S new file mode 100644 index 0000000000..f9c2753bc5 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_sse/vec128_mul_asm.S @@ -0,0 +1,1736 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: reg128 a0 + +# qhasm: reg128 a1 + +# qhasm: reg128 a2 + +# qhasm: reg128 a3 + +# qhasm: reg128 a4 + +# qhasm: reg128 a5 + +# qhasm: reg128 a6 + +# qhasm: reg128 a7 + +# qhasm: reg128 a8 + +# qhasm: reg128 a9 + +# qhasm: reg128 a10 + +# qhasm: reg128 a11 + +# qhasm: reg128 b0 + +# qhasm: reg128 b1 + +# qhasm: reg128 r0 + +# qhasm: reg128 r1 + +# qhasm: reg128 r2 + +# qhasm: reg128 r3 + +# qhasm: reg128 r4 + +# qhasm: reg128 r5 + +# qhasm: reg128 r6 + +# qhasm: reg128 r7 + +# qhasm: reg128 r8 + +# qhasm: reg128 r9 + +# qhasm: reg128 r10 + +# qhasm: reg128 r11 + +# qhasm: reg128 r12 + +# qhasm: reg128 r13 + +# qhasm: reg128 r14 + +# qhasm: reg128 r15 + +# qhasm: reg128 r16 + +# qhasm: reg128 r17 + +# qhasm: reg128 r18 + +# qhasm: reg128 r19 + +# qhasm: reg128 r20 + +# qhasm: reg128 r21 + +# qhasm: reg128 r22 + +# qhasm: reg128 r + +# qhasm: enter vec128_mul_asm +.p2align 5 +.global _PQCLEAN_MCELIECE348864_SSE_vec128_mul_asm +.global PQCLEAN_MCELIECE348864_SSE_vec128_mul_asm +_PQCLEAN_MCELIECE348864_SSE_vec128_mul_asm: +PQCLEAN_MCELIECE348864_SSE_vec128_mul_asm: +mov %rsp,%r11 +and $31,%r11 +add $0,%r11 +sub %r11,%rsp + +# qhasm: b0 = mem128[ input_2 + 0 ] +# asm 
1: movdqu 0(b0=reg128#1 +# asm 2: movdqu 0(b0=%xmm0 +movdqu 0(%rdx),%xmm0 + +# qhasm: a11 = mem128[ input_1 + 176 ] +# asm 1: movdqu 176(a11=reg128#2 +# asm 2: movdqu 176(a11=%xmm1 +movdqu 176(%rsi),%xmm1 + +# qhasm: r11 = a11 & b0 +# asm 1: vpand r11=reg128#3 +# asm 2: vpand r11=%xmm2 +vpand %xmm0,%xmm1,%xmm2 + +# qhasm: r12 = a11 & mem128[input_2 + 16] +# asm 1: vpand 16(r12=reg128#4 +# asm 2: vpand 16(r12=%xmm3 +vpand 16(%rdx),%xmm1,%xmm3 + +# qhasm: r13 = a11 & mem128[input_2 + 32] +# asm 1: vpand 32(r13=reg128#5 +# asm 2: vpand 32(r13=%xmm4 +vpand 32(%rdx),%xmm1,%xmm4 + +# qhasm: r14 = a11 & mem128[input_2 + 48] +# asm 1: vpand 48(r14=reg128#6 +# asm 2: vpand 48(r14=%xmm5 +vpand 48(%rdx),%xmm1,%xmm5 + +# qhasm: r15 = a11 & mem128[input_2 + 64] +# asm 1: vpand 64(r15=reg128#7 +# asm 2: vpand 64(r15=%xmm6 +vpand 64(%rdx),%xmm1,%xmm6 + +# qhasm: r16 = a11 & mem128[input_2 + 80] +# asm 1: vpand 80(r16=reg128#8 +# asm 2: vpand 80(r16=%xmm7 +vpand 80(%rdx),%xmm1,%xmm7 + +# qhasm: r17 = a11 & mem128[input_2 + 96] +# asm 1: vpand 96(r17=reg128#9 +# asm 2: vpand 96(r17=%xmm8 +vpand 96(%rdx),%xmm1,%xmm8 + +# qhasm: r18 = a11 & mem128[input_2 + 112] +# asm 1: vpand 112(r18=reg128#10 +# asm 2: vpand 112(r18=%xmm9 +vpand 112(%rdx),%xmm1,%xmm9 + +# qhasm: r19 = a11 & mem128[input_2 + 128] +# asm 1: vpand 128(r19=reg128#11 +# asm 2: vpand 128(r19=%xmm10 +vpand 128(%rdx),%xmm1,%xmm10 + +# qhasm: r20 = a11 & mem128[input_2 + 144] +# asm 1: vpand 144(r20=reg128#12 +# asm 2: vpand 144(r20=%xmm11 +vpand 144(%rdx),%xmm1,%xmm11 + +# qhasm: r21 = a11 & mem128[input_2 + 160] +# asm 1: vpand 160(r21=reg128#13 +# asm 2: vpand 160(r21=%xmm12 +vpand 160(%rdx),%xmm1,%xmm12 + +# qhasm: r22 = a11 & mem128[input_2 + 176] +# asm 1: vpand 176(r22=reg128#2 +# asm 2: vpand 176(r22=%xmm1 +vpand 176(%rdx),%xmm1,%xmm1 + +# qhasm: r13 ^= r22 +# asm 1: pxor r10=reg128#2 +# asm 2: movdqa r10=%xmm1 +movdqa %xmm1,%xmm1 + +# qhasm: a10 = mem128[ input_1 + 160 ] +# asm 1: movdqu 160(a10=reg128#14 +# asm 2: movdqu 160(a10=%xmm13 +movdqu 160(%rsi),%xmm13 + +# qhasm: r = a10 & b0 +# asm 1: vpand r=reg128#15 +# asm 2: vpand r=%xmm14 +vpand %xmm0,%xmm13,%xmm14 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 16(r=%xmm14 +vpand 16(%rdx),%xmm13,%xmm14 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 32(r=%xmm14 +vpand 32(%rdx),%xmm13,%xmm14 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 48(r=%xmm14 +vpand 48(%rdx),%xmm13,%xmm14 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 64(r=%xmm14 +vpand 64(%rdx),%xmm13,%xmm14 + +# qhasm: r14 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 80(r=%xmm14 +vpand 80(%rdx),%xmm13,%xmm14 + +# qhasm: r15 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 96(r=%xmm14 +vpand 96(%rdx),%xmm13,%xmm14 + +# qhasm: r16 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 112(r=%xmm14 +vpand 112(%rdx),%xmm13,%xmm14 + +# qhasm: r17 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 128(r=%xmm14 +vpand 128(%rdx),%xmm13,%xmm14 + +# qhasm: r18 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 144(r=%xmm14 +vpand 144(%rdx),%xmm13,%xmm14 + +# qhasm: r19 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 160(r=%xmm14 +vpand 160(%rdx),%xmm13,%xmm14 + +# qhasm: r20 ^= r +# asm 1: pxor r=reg128#14 +# asm 2: vpand 176(r=%xmm13 +vpand 176(%rdx),%xmm13,%xmm13 + +# qhasm: r21 ^= r +# asm 1: pxor r9=reg128#13 +# asm 2: movdqa r9=%xmm12 +movdqa %xmm12,%xmm12 + +# qhasm: a9 = mem128[ input_1 + 144 ] +# asm 1: movdqu 144(a9=reg128#14 +# asm 2: movdqu 144(a9=%xmm13 +movdqu 
144(%rsi),%xmm13 + +# qhasm: r = a9 & b0 +# asm 1: vpand r=reg128#15 +# asm 2: vpand r=%xmm14 +vpand %xmm0,%xmm13,%xmm14 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 16(r=%xmm14 +vpand 16(%rdx),%xmm13,%xmm14 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 32(r=%xmm14 +vpand 32(%rdx),%xmm13,%xmm14 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 48(r=%xmm14 +vpand 48(%rdx),%xmm13,%xmm14 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 64(r=%xmm14 +vpand 64(%rdx),%xmm13,%xmm14 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 80(r=%xmm14 +vpand 80(%rdx),%xmm13,%xmm14 + +# qhasm: r14 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 96(r=%xmm14 +vpand 96(%rdx),%xmm13,%xmm14 + +# qhasm: r15 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 112(r=%xmm14 +vpand 112(%rdx),%xmm13,%xmm14 + +# qhasm: r16 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 128(r=%xmm14 +vpand 128(%rdx),%xmm13,%xmm14 + +# qhasm: r17 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 144(r=%xmm14 +vpand 144(%rdx),%xmm13,%xmm14 + +# qhasm: r18 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 160(r=%xmm14 +vpand 160(%rdx),%xmm13,%xmm14 + +# qhasm: r19 ^= r +# asm 1: pxor r=reg128#14 +# asm 2: vpand 176(r=%xmm13 +vpand 176(%rdx),%xmm13,%xmm13 + +# qhasm: r20 ^= r +# asm 1: pxor r8=reg128#12 +# asm 2: movdqa r8=%xmm11 +movdqa %xmm11,%xmm11 + +# qhasm: a8 = mem128[ input_1 + 128 ] +# asm 1: movdqu 128(a8=reg128#14 +# asm 2: movdqu 128(a8=%xmm13 +movdqu 128(%rsi),%xmm13 + +# qhasm: r = a8 & b0 +# asm 1: vpand r=reg128#15 +# asm 2: vpand r=%xmm14 +vpand %xmm0,%xmm13,%xmm14 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 16(r=%xmm14 +vpand 16(%rdx),%xmm13,%xmm14 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 32(r=%xmm14 +vpand 32(%rdx),%xmm13,%xmm14 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 48(r=%xmm14 +vpand 48(%rdx),%xmm13,%xmm14 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 64(r=%xmm14 +vpand 64(%rdx),%xmm13,%xmm14 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 80(r=%xmm14 +vpand 80(%rdx),%xmm13,%xmm14 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 96(r=%xmm14 +vpand 96(%rdx),%xmm13,%xmm14 + +# qhasm: r14 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 112(r=%xmm14 +vpand 112(%rdx),%xmm13,%xmm14 + +# qhasm: r15 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 128(r=%xmm14 +vpand 128(%rdx),%xmm13,%xmm14 + +# qhasm: r16 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 144(r=%xmm14 +vpand 144(%rdx),%xmm13,%xmm14 + +# qhasm: r17 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 160(r=%xmm14 +vpand 160(%rdx),%xmm13,%xmm14 + +# qhasm: r18 ^= r +# asm 1: pxor r=reg128#14 +# asm 2: vpand 176(r=%xmm13 +vpand 176(%rdx),%xmm13,%xmm13 + +# qhasm: r19 ^= r +# asm 1: pxor r7=reg128#11 +# asm 2: movdqa r7=%xmm10 +movdqa %xmm10,%xmm10 + +# qhasm: a7 = mem128[ input_1 + 112 ] +# asm 1: movdqu 112(a7=reg128#14 +# asm 2: movdqu 112(a7=%xmm13 +movdqu 112(%rsi),%xmm13 + +# qhasm: r = a7 & b0 +# asm 1: vpand r=reg128#15 +# asm 2: vpand r=%xmm14 +vpand %xmm0,%xmm13,%xmm14 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 16(r=%xmm14 +vpand 16(%rdx),%xmm13,%xmm14 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 32(r=%xmm14 +vpand 32(%rdx),%xmm13,%xmm14 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 48(r=%xmm14 +vpand 48(%rdx),%xmm13,%xmm14 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 64(r=%xmm14 +vpand 
64(%rdx),%xmm13,%xmm14 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 80(r=%xmm14 +vpand 80(%rdx),%xmm13,%xmm14 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 96(r=%xmm14 +vpand 96(%rdx),%xmm13,%xmm14 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 112(r=%xmm14 +vpand 112(%rdx),%xmm13,%xmm14 + +# qhasm: r14 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 128(r=%xmm14 +vpand 128(%rdx),%xmm13,%xmm14 + +# qhasm: r15 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 144(r=%xmm14 +vpand 144(%rdx),%xmm13,%xmm14 + +# qhasm: r16 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 160(r=%xmm14 +vpand 160(%rdx),%xmm13,%xmm14 + +# qhasm: r17 ^= r +# asm 1: pxor r=reg128#14 +# asm 2: vpand 176(r=%xmm13 +vpand 176(%rdx),%xmm13,%xmm13 + +# qhasm: r18 ^= r +# asm 1: pxor r6=reg128#10 +# asm 2: movdqa r6=%xmm9 +movdqa %xmm9,%xmm9 + +# qhasm: a6 = mem128[ input_1 + 96 ] +# asm 1: movdqu 96(a6=reg128#14 +# asm 2: movdqu 96(a6=%xmm13 +movdqu 96(%rsi),%xmm13 + +# qhasm: r = a6 & b0 +# asm 1: vpand r=reg128#15 +# asm 2: vpand r=%xmm14 +vpand %xmm0,%xmm13,%xmm14 + +# qhasm: r6 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 16(r=%xmm14 +vpand 16(%rdx),%xmm13,%xmm14 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 32(r=%xmm14 +vpand 32(%rdx),%xmm13,%xmm14 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 48(r=%xmm14 +vpand 48(%rdx),%xmm13,%xmm14 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 64(r=%xmm14 +vpand 64(%rdx),%xmm13,%xmm14 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 80(r=%xmm14 +vpand 80(%rdx),%xmm13,%xmm14 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 96(r=%xmm14 +vpand 96(%rdx),%xmm13,%xmm14 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 112(r=%xmm14 +vpand 112(%rdx),%xmm13,%xmm14 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 128(r=%xmm14 +vpand 128(%rdx),%xmm13,%xmm14 + +# qhasm: r14 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 144(r=%xmm14 +vpand 144(%rdx),%xmm13,%xmm14 + +# qhasm: r15 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 160(r=%xmm14 +vpand 160(%rdx),%xmm13,%xmm14 + +# qhasm: r16 ^= r +# asm 1: pxor r=reg128#14 +# asm 2: vpand 176(r=%xmm13 +vpand 176(%rdx),%xmm13,%xmm13 + +# qhasm: r17 ^= r +# asm 1: pxor r5=reg128#9 +# asm 2: movdqa r5=%xmm8 +movdqa %xmm8,%xmm8 + +# qhasm: a5 = mem128[ input_1 + 80 ] +# asm 1: movdqu 80(a5=reg128#14 +# asm 2: movdqu 80(a5=%xmm13 +movdqu 80(%rsi),%xmm13 + +# qhasm: r = a5 & b0 +# asm 1: vpand r=reg128#15 +# asm 2: vpand r=%xmm14 +vpand %xmm0,%xmm13,%xmm14 + +# qhasm: r5 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 16(r=%xmm14 +vpand 16(%rdx),%xmm13,%xmm14 + +# qhasm: r6 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 32(r=%xmm14 +vpand 32(%rdx),%xmm13,%xmm14 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 48(r=%xmm14 +vpand 48(%rdx),%xmm13,%xmm14 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 64(r=%xmm14 +vpand 64(%rdx),%xmm13,%xmm14 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 80(r=%xmm14 +vpand 80(%rdx),%xmm13,%xmm14 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 96(r=%xmm14 +vpand 96(%rdx),%xmm13,%xmm14 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 112(r=%xmm14 +vpand 112(%rdx),%xmm13,%xmm14 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 128(r=%xmm14 +vpand 128(%rdx),%xmm13,%xmm14 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 144(r=%xmm14 +vpand 144(%rdx),%xmm13,%xmm14 + 
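vec128_mul_asm.S implements the bitsliced field multiplication declared in vec128.h: a0..a11 and the twelve 128-bit words at input_2 are the GFBITS = 12 bit planes of the two operands, so every partial product is a single vpand and every accumulation a single pxor into r0..r22, the planes of the unreduced schoolbook product. A compact C model of that inner computation on uint64_t planes (names are illustrative; the real routine works on vec128 and interleaves the reduction with the accumulation):

    #include <stdint.h>

    #define GFBITS 12   /* GF(2^12), as used by mceliece348864 */

    /* Bitsliced carry-less schoolbook multiply: f[] and g[] are GFBITS
     * bit planes, tmp[] receives the 2*GFBITS-1 planes of the unreduced
     * product.  AND is a GF(2) multiply, XOR is a GF(2) add. */
    static void bitsliced_mul_unreduced(uint64_t tmp[2 * GFBITS - 1],
                                        const uint64_t f[GFBITS],
                                        const uint64_t g[GFBITS])
    {
        for (int i = 0; i < 2 * GFBITS - 1; i++) {
            tmp[i] = 0;
        }
        for (int i = 0; i < GFBITS; i++) {
            for (int j = 0; j < GFBITS; j++) {
                tmp[i + j] ^= f[i] & g[j];
            }
        }
    }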
+# qhasm: r14 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 160(r=%xmm14 +vpand 160(%rdx),%xmm13,%xmm14 + +# qhasm: r15 ^= r +# asm 1: pxor r=reg128#14 +# asm 2: vpand 176(r=%xmm13 +vpand 176(%rdx),%xmm13,%xmm13 + +# qhasm: r16 ^= r +# asm 1: pxor r4=reg128#8 +# asm 2: movdqa r4=%xmm7 +movdqa %xmm7,%xmm7 + +# qhasm: a4 = mem128[ input_1 + 64 ] +# asm 1: movdqu 64(a4=reg128#14 +# asm 2: movdqu 64(a4=%xmm13 +movdqu 64(%rsi),%xmm13 + +# qhasm: r = a4 & b0 +# asm 1: vpand r=reg128#15 +# asm 2: vpand r=%xmm14 +vpand %xmm0,%xmm13,%xmm14 + +# qhasm: r4 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 16(r=%xmm14 +vpand 16(%rdx),%xmm13,%xmm14 + +# qhasm: r5 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 32(r=%xmm14 +vpand 32(%rdx),%xmm13,%xmm14 + +# qhasm: r6 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 48(r=%xmm14 +vpand 48(%rdx),%xmm13,%xmm14 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 64(r=%xmm14 +vpand 64(%rdx),%xmm13,%xmm14 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 80(r=%xmm14 +vpand 80(%rdx),%xmm13,%xmm14 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 96(r=%xmm14 +vpand 96(%rdx),%xmm13,%xmm14 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 112(r=%xmm14 +vpand 112(%rdx),%xmm13,%xmm14 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 128(r=%xmm14 +vpand 128(%rdx),%xmm13,%xmm14 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 144(r=%xmm14 +vpand 144(%rdx),%xmm13,%xmm14 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 160(r=%xmm14 +vpand 160(%rdx),%xmm13,%xmm14 + +# qhasm: r14 ^= r +# asm 1: pxor r=reg128#14 +# asm 2: vpand 176(r=%xmm13 +vpand 176(%rdx),%xmm13,%xmm13 + +# qhasm: r15 ^= r +# asm 1: pxor r3=reg128#7 +# asm 2: movdqa r3=%xmm6 +movdqa %xmm6,%xmm6 + +# qhasm: a3 = mem128[ input_1 + 48 ] +# asm 1: movdqu 48(a3=reg128#14 +# asm 2: movdqu 48(a3=%xmm13 +movdqu 48(%rsi),%xmm13 + +# qhasm: r = a3 & b0 +# asm 1: vpand r=reg128#15 +# asm 2: vpand r=%xmm14 +vpand %xmm0,%xmm13,%xmm14 + +# qhasm: r3 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 16(r=%xmm14 +vpand 16(%rdx),%xmm13,%xmm14 + +# qhasm: r4 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 32(r=%xmm14 +vpand 32(%rdx),%xmm13,%xmm14 + +# qhasm: r5 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 48(r=%xmm14 +vpand 48(%rdx),%xmm13,%xmm14 + +# qhasm: r6 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 64(r=%xmm14 +vpand 64(%rdx),%xmm13,%xmm14 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 80(r=%xmm14 +vpand 80(%rdx),%xmm13,%xmm14 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 96(r=%xmm14 +vpand 96(%rdx),%xmm13,%xmm14 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 112(r=%xmm14 +vpand 112(%rdx),%xmm13,%xmm14 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 128(r=%xmm14 +vpand 128(%rdx),%xmm13,%xmm14 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 144(r=%xmm14 +vpand 144(%rdx),%xmm13,%xmm14 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 160(r=%xmm14 +vpand 160(%rdx),%xmm13,%xmm14 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#14 +# asm 2: vpand 176(r=%xmm13 +vpand 176(%rdx),%xmm13,%xmm13 + +# qhasm: r14 ^= r +# asm 1: pxor r2=reg128#6 +# asm 2: movdqa r2=%xmm5 +movdqa %xmm5,%xmm5 + +# qhasm: a2 = mem128[ input_1 + 32 ] +# asm 1: movdqu 32(a2=reg128#14 +# asm 2: movdqu 32(a2=%xmm13 +movdqu 32(%rsi),%xmm13 + +# qhasm: r = a2 & b0 +# asm 1: vpand r=reg128#15 +# asm 2: vpand r=%xmm14 +vpand %xmm0,%xmm13,%xmm14 + +# qhasm: r2 ^= r +# asm 
1: pxor r=reg128#15 +# asm 2: vpand 16(r=%xmm14 +vpand 16(%rdx),%xmm13,%xmm14 + +# qhasm: r3 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 32(r=%xmm14 +vpand 32(%rdx),%xmm13,%xmm14 + +# qhasm: r4 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 48(r=%xmm14 +vpand 48(%rdx),%xmm13,%xmm14 + +# qhasm: r5 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 64(r=%xmm14 +vpand 64(%rdx),%xmm13,%xmm14 + +# qhasm: r6 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 80(r=%xmm14 +vpand 80(%rdx),%xmm13,%xmm14 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 96(r=%xmm14 +vpand 96(%rdx),%xmm13,%xmm14 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 112(r=%xmm14 +vpand 112(%rdx),%xmm13,%xmm14 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 128(r=%xmm14 +vpand 128(%rdx),%xmm13,%xmm14 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 144(r=%xmm14 +vpand 144(%rdx),%xmm13,%xmm14 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 160(r=%xmm14 +vpand 160(%rdx),%xmm13,%xmm14 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#14 +# asm 2: vpand 176(r=%xmm13 +vpand 176(%rdx),%xmm13,%xmm13 + +# qhasm: r13 ^= r +# asm 1: pxor r1=reg128#5 +# asm 2: movdqa r1=%xmm4 +movdqa %xmm4,%xmm4 + +# qhasm: a1 = mem128[ input_1 + 16 ] +# asm 1: movdqu 16(a1=reg128#14 +# asm 2: movdqu 16(a1=%xmm13 +movdqu 16(%rsi),%xmm13 + +# qhasm: r = a1 & b0 +# asm 1: vpand r=reg128#15 +# asm 2: vpand r=%xmm14 +vpand %xmm0,%xmm13,%xmm14 + +# qhasm: r1 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 16(r=%xmm14 +vpand 16(%rdx),%xmm13,%xmm14 + +# qhasm: r2 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 32(r=%xmm14 +vpand 32(%rdx),%xmm13,%xmm14 + +# qhasm: r3 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 48(r=%xmm14 +vpand 48(%rdx),%xmm13,%xmm14 + +# qhasm: r4 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 64(r=%xmm14 +vpand 64(%rdx),%xmm13,%xmm14 + +# qhasm: r5 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 80(r=%xmm14 +vpand 80(%rdx),%xmm13,%xmm14 + +# qhasm: r6 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 96(r=%xmm14 +vpand 96(%rdx),%xmm13,%xmm14 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 112(r=%xmm14 +vpand 112(%rdx),%xmm13,%xmm14 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 128(r=%xmm14 +vpand 128(%rdx),%xmm13,%xmm14 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 144(r=%xmm14 +vpand 144(%rdx),%xmm13,%xmm14 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#15 +# asm 2: vpand 160(r=%xmm14 +vpand 160(%rdx),%xmm13,%xmm14 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#14 +# asm 2: vpand 176(r=%xmm13 +vpand 176(%rdx),%xmm13,%xmm13 + +# qhasm: r12 ^= r +# asm 1: pxor r0=reg128#4 +# asm 2: movdqa r0=%xmm3 +movdqa %xmm3,%xmm3 + +# qhasm: a0 = mem128[ input_1 + 0 ] +# asm 1: movdqu 0(a0=reg128#14 +# asm 2: movdqu 0(a0=%xmm13 +movdqu 0(%rsi),%xmm13 + +# qhasm: r = a0 & b0 +# asm 1: vpand r=reg128#1 +# asm 2: vpand r=%xmm0 +vpand %xmm0,%xmm13,%xmm0 + +# qhasm: r0 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 16(r=%xmm0 +vpand 16(%rdx),%xmm13,%xmm0 + +# qhasm: r1 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 32(r=%xmm0 +vpand 32(%rdx),%xmm13,%xmm0 + +# qhasm: r2 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 48(r=%xmm0 +vpand 48(%rdx),%xmm13,%xmm0 + +# qhasm: r3 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 64(r=%xmm0 +vpand 64(%rdx),%xmm13,%xmm0 + +# qhasm: r4 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 80(r=%xmm0 +vpand 80(%rdx),%xmm13,%xmm0 + +# qhasm: r5 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 96(r=%xmm0 +vpand 
96(%rdx),%xmm13,%xmm0 + +# qhasm: r6 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 112(r=%xmm0 +vpand 112(%rdx),%xmm13,%xmm0 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 128(r=%xmm0 +vpand 128(%rdx),%xmm13,%xmm0 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 144(r=%xmm0 +vpand 144(%rdx),%xmm13,%xmm0 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 160(r=%xmm0 +vpand 160(%rdx),%xmm13,%xmm0 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand 176(r=%xmm0 +vpand 176(%rdx),%xmm13,%xmm0 + +# qhasm: r11 ^= r +# asm 1: pxor r11_stack=stack64#1 +# asm 2: movq r11_stack=608(%rsp) +movq %r11,608(%rsp) + +# qhasm: r12_stack = caller_r12 +# asm 1: movq r12_stack=stack64#2 +# asm 2: movq r12_stack=616(%rsp) +movq %r12,616(%rsp) + +# qhasm: r13_stack = caller_r13 +# asm 1: movq r13_stack=stack64#3 +# asm 2: movq r13_stack=624(%rsp) +movq %r13,624(%rsp) + +# qhasm: r14_stack = caller_r14 +# asm 1: movq r14_stack=stack64#4 +# asm 2: movq r14_stack=632(%rsp) +movq %r14,632(%rsp) + +# qhasm: r15_stack = caller_r15 +# asm 1: movq r15_stack=stack64#5 +# asm 2: movq r15_stack=640(%rsp) +movq %r15,640(%rsp) + +# qhasm: rbx_stack = caller_rbx +# asm 1: movq rbx_stack=stack64#6 +# asm 2: movq rbx_stack=648(%rsp) +movq %rbx,648(%rsp) + +# qhasm: ptr = &buf +# asm 1: leaq ptr=int64#5 +# asm 2: leaq ptr=%r8 +leaq 0(%rsp),%r8 + +# qhasm: tmp = input_3 +# asm 1: mov tmp=int64#6 +# asm 2: mov tmp=%r9 +mov %rcx,%r9 + +# qhasm: tmp *= 11 +# asm 1: imulq $11,tmp=int64#6 +# asm 2: imulq $11,tmp=%r9 +imulq $11,%r9,%r9 + +# qhasm: input_2 += tmp +# asm 1: add b11=reg128#1 +# asm 2: movddup 0(b11=%xmm0 +movddup 0(%rdx),%xmm0 + +# qhasm: input_2 -= input_3 +# asm 1: sub r16=reg128#3 +# asm 2: vpand r16=%xmm2 +vpand %xmm1,%xmm0,%xmm2 + +# qhasm: mem128[ ptr + 256 ] = r16 +# asm 1: movdqu r15=reg128#4 +# asm 2: vpand r15=%xmm3 +vpand %xmm2,%xmm0,%xmm3 + +# qhasm: a3[0] = mem64[ input_1 + 24 ] +# asm 1: pinsrq $0x0,24(r14=reg128#6 +# asm 2: vpand r14=%xmm5 +vpand %xmm4,%xmm0,%xmm5 + +# qhasm: a2[0] = mem64[ input_1 + 16 ] +# asm 1: pinsrq $0x0,16(r13=reg128#8 +# asm 2: vpand r13=%xmm7 +vpand %xmm6,%xmm0,%xmm7 + +# qhasm: a1[0] = mem64[ input_1 + 8 ] +# asm 1: pinsrq $0x0,8(r12=reg128#10 +# asm 2: vpand r12=%xmm9 +vpand %xmm8,%xmm0,%xmm9 + +# qhasm: a0[0] = mem64[ input_1 + 0 ] +# asm 1: pinsrq $0x0,0(r11=reg128#1 +# asm 2: vpand r11=%xmm0 +vpand %xmm10,%xmm0,%xmm0 + +# qhasm: b10 = mem64[ input_2 + 0 ] x2 +# asm 1: movddup 0(b10=reg128#12 +# asm 2: movddup 0(b10=%xmm11 +movddup 0(%rdx),%xmm11 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg128#13 +# asm 2: vpand r=%xmm12 +vpand %xmm1,%xmm11,%xmm12 + +# qhasm: r15 ^= r +# asm 1: pxor r=reg128#4 +# asm 2: vpand r=%xmm3 +vpand %xmm2,%xmm11,%xmm3 + +# qhasm: r14 ^= r +# asm 1: pxor r=reg128#4 +# asm 2: vpand r=%xmm3 +vpand %xmm4,%xmm11,%xmm3 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#4 +# asm 2: vpand r=%xmm3 +vpand %xmm6,%xmm11,%xmm3 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#4 +# asm 2: vpand r=%xmm3 +vpand %xmm8,%xmm11,%xmm3 + +# qhasm: r11 ^= r +# asm 1: pxor r10=reg128#4 +# asm 2: vpand r10=%xmm3 +vpand %xmm10,%xmm11,%xmm3 + +# qhasm: b9 = mem64[ input_2 + 0 ] x2 +# asm 1: movddup 0(b9=reg128#12 +# asm 2: movddup 0(b9=%xmm11 +movddup 0(%rdx),%xmm11 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg128#13 +# asm 2: vpand r=%xmm12 +vpand %xmm1,%xmm11,%xmm12 + +# qhasm: r14 ^= r +# asm 1: pxor r=reg128#6 +# asm 2: vpand r=%xmm5 +vpand %xmm2,%xmm11,%xmm5 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#6 +# asm 2: vpand 
r=%xmm5 +vpand %xmm4,%xmm11,%xmm5 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#6 +# asm 2: vpand r=%xmm5 +vpand %xmm6,%xmm11,%xmm5 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#6 +# asm 2: vpand r=%xmm5 +vpand %xmm8,%xmm11,%xmm5 + +# qhasm: r10 ^= r +# asm 1: pxor r9=reg128#6 +# asm 2: vpand r9=%xmm5 +vpand %xmm10,%xmm11,%xmm5 + +# qhasm: b8 = mem64[ input_2 + 0 ] x2 +# asm 1: movddup 0(b8=reg128#12 +# asm 2: movddup 0(b8=%xmm11 +movddup 0(%rdx),%xmm11 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg128#13 +# asm 2: vpand r=%xmm12 +vpand %xmm1,%xmm11,%xmm12 + +# qhasm: r13 ^= r +# asm 1: pxor r=reg128#8 +# asm 2: vpand r=%xmm7 +vpand %xmm2,%xmm11,%xmm7 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#8 +# asm 2: vpand r=%xmm7 +vpand %xmm4,%xmm11,%xmm7 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#8 +# asm 2: vpand r=%xmm7 +vpand %xmm6,%xmm11,%xmm7 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#8 +# asm 2: vpand r=%xmm7 +vpand %xmm8,%xmm11,%xmm7 + +# qhasm: r9 ^= r +# asm 1: pxor r8=reg128#8 +# asm 2: vpand r8=%xmm7 +vpand %xmm10,%xmm11,%xmm7 + +# qhasm: b7 = mem64[ input_2 + 0 ] x2 +# asm 1: movddup 0(b7=reg128#12 +# asm 2: movddup 0(b7=%xmm11 +movddup 0(%rdx),%xmm11 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg128#13 +# asm 2: vpand r=%xmm12 +vpand %xmm1,%xmm11,%xmm12 + +# qhasm: r12 ^= r +# asm 1: pxor r=reg128#10 +# asm 2: vpand r=%xmm9 +vpand %xmm2,%xmm11,%xmm9 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#10 +# asm 2: vpand r=%xmm9 +vpand %xmm4,%xmm11,%xmm9 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#10 +# asm 2: vpand r=%xmm9 +vpand %xmm6,%xmm11,%xmm9 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#10 +# asm 2: vpand r=%xmm9 +vpand %xmm8,%xmm11,%xmm9 + +# qhasm: r8 ^= r +# asm 1: pxor r7=reg128#10 +# asm 2: vpand r7=%xmm9 +vpand %xmm10,%xmm11,%xmm9 + +# qhasm: b6 = mem64[ input_2 + 0 ] x2 +# asm 1: movddup 0(b6=reg128#12 +# asm 2: movddup 0(b6=%xmm11 +movddup 0(%rdx),%xmm11 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg128#13 +# asm 2: vpand r=%xmm12 +vpand %xmm1,%xmm11,%xmm12 + +# qhasm: r11 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand r=%xmm0 +vpand %xmm2,%xmm11,%xmm0 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand r=%xmm0 +vpand %xmm4,%xmm11,%xmm0 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand r=%xmm0 +vpand %xmm6,%xmm11,%xmm0 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand r=%xmm0 +vpand %xmm8,%xmm11,%xmm0 + +# qhasm: r7 ^= r +# asm 1: pxor r6=reg128#1 +# asm 2: vpand r6=%xmm0 +vpand %xmm10,%xmm11,%xmm0 + +# qhasm: b5 = mem64[ input_2 + 0 ] x2 +# asm 1: movddup 0(b5=reg128#12 +# asm 2: movddup 0(b5=%xmm11 +movddup 0(%rdx),%xmm11 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg128#13 +# asm 2: vpand r=%xmm12 +vpand %xmm1,%xmm11,%xmm12 + +# qhasm: r10 ^= r +# asm 1: pxor r=reg128#4 +# asm 2: vpand r=%xmm3 +vpand %xmm2,%xmm11,%xmm3 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#4 +# asm 2: vpand r=%xmm3 +vpand %xmm4,%xmm11,%xmm3 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#4 +# asm 2: vpand r=%xmm3 +vpand %xmm6,%xmm11,%xmm3 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#4 +# asm 2: vpand r=%xmm3 +vpand %xmm8,%xmm11,%xmm3 + +# qhasm: r6 ^= r +# asm 1: pxor r5=reg128#4 +# asm 2: vpand r5=%xmm3 +vpand %xmm10,%xmm11,%xmm3 + +# qhasm: b4 = mem64[ input_2 + 0 ] x2 +# asm 1: movddup 0(b4=reg128#12 +# asm 2: movddup 0(b4=%xmm11 +movddup 0(%rdx),%xmm11 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg128#13 +# asm 2: vpand r=%xmm12 +vpand %xmm1,%xmm11,%xmm12 + +# qhasm: r9 ^= r +# asm 1: pxor r=reg128#6 +# asm 2: vpand r=%xmm5 +vpand 
%xmm2,%xmm11,%xmm5 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#6 +# asm 2: vpand r=%xmm5 +vpand %xmm4,%xmm11,%xmm5 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#6 +# asm 2: vpand r=%xmm5 +vpand %xmm6,%xmm11,%xmm5 + +# qhasm: r6 ^= r +# asm 1: pxor r=reg128#6 +# asm 2: vpand r=%xmm5 +vpand %xmm8,%xmm11,%xmm5 + +# qhasm: r5 ^= r +# asm 1: pxor r4=reg128#6 +# asm 2: vpand r4=%xmm5 +vpand %xmm10,%xmm11,%xmm5 + +# qhasm: b3 = mem64[ input_2 + 0 ] x2 +# asm 1: movddup 0(b3=reg128#12 +# asm 2: movddup 0(b3=%xmm11 +movddup 0(%rdx),%xmm11 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg128#13 +# asm 2: vpand r=%xmm12 +vpand %xmm1,%xmm11,%xmm12 + +# qhasm: r8 ^= r +# asm 1: pxor r=reg128#8 +# asm 2: vpand r=%xmm7 +vpand %xmm2,%xmm11,%xmm7 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#8 +# asm 2: vpand r=%xmm7 +vpand %xmm4,%xmm11,%xmm7 + +# qhasm: r6 ^= r +# asm 1: pxor r=reg128#8 +# asm 2: vpand r=%xmm7 +vpand %xmm6,%xmm11,%xmm7 + +# qhasm: r5 ^= r +# asm 1: pxor r=reg128#8 +# asm 2: vpand r=%xmm7 +vpand %xmm8,%xmm11,%xmm7 + +# qhasm: r4 ^= r +# asm 1: pxor r3=reg128#8 +# asm 2: vpand r3=%xmm7 +vpand %xmm10,%xmm11,%xmm7 + +# qhasm: b2 = mem64[ input_2 + 0 ] x2 +# asm 1: movddup 0(b2=reg128#12 +# asm 2: movddup 0(b2=%xmm11 +movddup 0(%rdx),%xmm11 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg128#13 +# asm 2: vpand r=%xmm12 +vpand %xmm1,%xmm11,%xmm12 + +# qhasm: r7 ^= r +# asm 1: pxor r=reg128#10 +# asm 2: vpand r=%xmm9 +vpand %xmm2,%xmm11,%xmm9 + +# qhasm: r6 ^= r +# asm 1: pxor r=reg128#10 +# asm 2: vpand r=%xmm9 +vpand %xmm4,%xmm11,%xmm9 + +# qhasm: r5 ^= r +# asm 1: pxor r=reg128#10 +# asm 2: vpand r=%xmm9 +vpand %xmm6,%xmm11,%xmm9 + +# qhasm: r4 ^= r +# asm 1: pxor r=reg128#10 +# asm 2: vpand r=%xmm9 +vpand %xmm8,%xmm11,%xmm9 + +# qhasm: r3 ^= r +# asm 1: pxor r2=reg128#10 +# asm 2: vpand r2=%xmm9 +vpand %xmm10,%xmm11,%xmm9 + +# qhasm: b1 = mem64[ input_2 + 0 ] x2 +# asm 1: movddup 0(b1=reg128#12 +# asm 2: movddup 0(b1=%xmm11 +movddup 0(%rdx),%xmm11 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg128#13 +# asm 2: vpand r=%xmm12 +vpand %xmm1,%xmm11,%xmm12 + +# qhasm: r6 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand r=%xmm0 +vpand %xmm2,%xmm11,%xmm0 + +# qhasm: r5 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand r=%xmm0 +vpand %xmm4,%xmm11,%xmm0 + +# qhasm: r4 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand r=%xmm0 +vpand %xmm6,%xmm11,%xmm0 + +# qhasm: r3 ^= r +# asm 1: pxor r=reg128#1 +# asm 2: vpand r=%xmm0 +vpand %xmm8,%xmm11,%xmm0 + +# qhasm: r2 ^= r +# asm 1: pxor r1=reg128#1 +# asm 2: vpand r1=%xmm0 +vpand %xmm10,%xmm11,%xmm0 + +# qhasm: b0 = mem64[ input_2 + 0 ] x2 +# asm 1: movddup 0(b0=reg128#12 +# asm 2: movddup 0(b0=%xmm11 +movddup 0(%rdx),%xmm11 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg128#2 +# asm 2: vpand r=%xmm1 +vpand %xmm1,%xmm11,%xmm1 + +# qhasm: r5 ^= r +# asm 1: pxor r=reg128#2 +# asm 2: vpand r=%xmm1 +vpand %xmm2,%xmm11,%xmm1 + +# qhasm: r4 ^= r +# asm 1: pxor r=reg128#2 +# asm 2: vpand r=%xmm1 +vpand %xmm4,%xmm11,%xmm1 + +# qhasm: r3 ^= r +# asm 1: pxor r=reg128#2 +# asm 2: vpand r=%xmm1 +vpand %xmm6,%xmm11,%xmm1 + +# qhasm: r2 ^= r +# asm 1: pxor r=reg128#2 +# asm 2: vpand r=%xmm1 +vpand %xmm8,%xmm11,%xmm1 + +# qhasm: r1 ^= r +# asm 1: pxor r0=reg128#2 +# asm 2: vpand r0=%xmm1 +vpand %xmm10,%xmm11,%xmm1 + +# qhasm: mem128[ ptr + 64 ] = r4 +# asm 1: movdqu h22=int64#2 +# asm 2: movq 264(h22=%rsi +movq 264(%r8),%rsi + +# qhasm: h13 = h22 +# asm 1: mov h13=int64#3 +# asm 2: mov h13=%rdx +mov %rsi,%rdx + +# qhasm: h10 = h22 +# asm 1: mov h10=int64#2 +# asm 2: mov 
h10=%rsi +mov %rsi,%rsi + +# qhasm: h21 = mem64[ ptr + 248 ] +# asm 1: movq 248(h21=int64#4 +# asm 2: movq 248(h21=%rcx +movq 248(%r8),%rcx + +# qhasm: h12 = h21 +# asm 1: mov h12=int64#6 +# asm 2: mov h12=%r9 +mov %rcx,%r9 + +# qhasm: h9 = h21 +# asm 1: mov h9=int64#4 +# asm 2: mov h9=%rcx +mov %rcx,%rcx + +# qhasm: h20 = mem64[ ptr + 232 ] +# asm 1: movq 232(h20=int64#7 +# asm 2: movq 232(h20=%rax +movq 232(%r8),%rax + +# qhasm: h11 = h20 +# asm 1: mov h11=int64#8 +# asm 2: mov h11=%r10 +mov %rax,%r10 + +# qhasm: h8 = h20 +# asm 1: mov h8=int64#7 +# asm 2: mov h8=%rax +mov %rax,%rax + +# qhasm: h19 = mem64[ ptr + 216 ] +# asm 1: movq 216(h19=int64#9 +# asm 2: movq 216(h19=%r11 +movq 216(%r8),%r11 + +# qhasm: h10 ^= h19 +# asm 1: xor h7=int64#9 +# asm 2: mov h7=%r11 +mov %r11,%r11 + +# qhasm: h18 = mem64[ ptr + 200 ] +# asm 1: movq 200(h18=int64#10 +# asm 2: movq 200(h18=%r12 +movq 200(%r8),%r12 + +# qhasm: h9 ^= h18 +# asm 1: xor h6=int64#10 +# asm 2: mov h6=%r12 +mov %r12,%r12 + +# qhasm: h17 = mem64[ ptr + 184 ] +# asm 1: movq 184(h17=int64#11 +# asm 2: movq 184(h17=%r13 +movq 184(%r8),%r13 + +# qhasm: h8 ^= h17 +# asm 1: xor h5=int64#11 +# asm 2: mov h5=%r13 +mov %r13,%r13 + +# qhasm: h16 = mem64[ ptr + 168 ] +# asm 1: movq 168(h16=int64#12 +# asm 2: movq 168(h16=%r14 +movq 168(%r8),%r14 + +# qhasm: h16 ^= *(uint64 *) ( ptr + 256 ) +# asm 1: xorq 256(h4=int64#12 +# asm 2: mov h4=%r14 +mov %r14,%r14 + +# qhasm: h15 = mem64[ ptr + 152 ] +# asm 1: movq 152(h15=int64#13 +# asm 2: movq 152(h15=%r15 +movq 152(%r8),%r15 + +# qhasm: h15 ^= *(uint64 *) ( ptr + 240 ) +# asm 1: xorq 240(h3=int64#13 +# asm 2: mov h3=%r15 +mov %r15,%r15 + +# qhasm: h14 = mem64[ ptr + 136 ] +# asm 1: movq 136(h14=int64#14 +# asm 2: movq 136(h14=%rbx +movq 136(%r8),%rbx + +# qhasm: h14 ^= *(uint64 *) ( ptr + 224 ) +# asm 1: xorq 224(h2=int64#14 +# asm 2: mov h2=%rbx +mov %rbx,%rbx + +# qhasm: h13 ^= *(uint64 *) ( ptr + 120 ) +# asm 1: xorq 120(h1=int64#3 +# asm 2: mov h1=%rdx +mov %rdx,%rdx + +# qhasm: h12 ^= *(uint64 *) ( ptr + 104 ) +# asm 1: xorq 104(h0=int64#6 +# asm 2: mov h0=%r9 +mov %r9,%r9 + +# qhasm: h11 ^= *(uint64 *) ( ptr + 176 ) +# asm 1: xorq 176(caller_r11=int64#9 +# asm 2: movq caller_r11=%r11 +movq 608(%rsp),%r11 + +# qhasm: caller_r12 = r12_stack +# asm 1: movq caller_r12=int64#10 +# asm 2: movq caller_r12=%r12 +movq 616(%rsp),%r12 + +# qhasm: caller_r13 = r13_stack +# asm 1: movq caller_r13=int64#11 +# asm 2: movq caller_r13=%r13 +movq 624(%rsp),%r13 + +# qhasm: caller_r14 = r14_stack +# asm 1: movq caller_r14=int64#12 +# asm 2: movq caller_r14=%r14 +movq 632(%rsp),%r14 + +# qhasm: caller_r15 = r15_stack +# asm 1: movq caller_r15=int64#13 +# asm 2: movq caller_r15=%r15 +movq 640(%rsp),%r15 + +# qhasm: caller_rbx = rbx_stack +# asm 1: movq caller_rbx=int64#14 +# asm 2: movq caller_rbx=%rbx +movq 648(%rsp),%rbx + +# qhasm: return +add %r11,%rsp +ret diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864_sse/vec_reduce_asm.S b/src/kem/classic_mceliece/pqclean_mceliece348864_sse/vec_reduce_asm.S new file mode 100644 index 0000000000..0c9caf1434 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864_sse/vec_reduce_asm.S @@ -0,0 +1,356 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 
caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: int64 t + +# qhasm: int64 c + +# qhasm: int64 r + +# qhasm: enter vec_reduce_asm +.p2align 5 +.global _PQCLEAN_MCELIECE348864_SSE_vec_reduce_asm +.global PQCLEAN_MCELIECE348864_SSE_vec_reduce_asm +_PQCLEAN_MCELIECE348864_SSE_vec_reduce_asm: +PQCLEAN_MCELIECE348864_SSE_vec_reduce_asm: +mov %rsp,%r11 +and $31,%r11 +add $0,%r11 +sub %r11,%rsp + +# qhasm: r = 0 +# asm 1: mov $0,>r=int64#7 +# asm 2: mov $0,>r=%rax +mov $0,%rax + +# qhasm: t = mem64[ input_0 + 88 ] +# asm 1: movq 88(t=int64#2 +# asm 2: movq 88(t=%rsi +movq 88(%rdi),%rsi + +# qhasm: c = count(t) +# asm 1: popcnt c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t=int64#2 +# asm 2: movq 80(t=%rsi +movq 80(%rdi),%rsi + +# qhasm: c = count(t) +# asm 1: popcnt c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t=int64#2 +# asm 2: movq 72(t=%rsi +movq 72(%rdi),%rsi + +# qhasm: c = count(t) +# asm 1: popcnt c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t=int64#2 +# asm 2: movq 64(t=%rsi +movq 64(%rdi),%rsi + +# qhasm: c = count(t) +# asm 1: popcnt c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t=int64#2 +# asm 2: movq 56(t=%rsi +movq 56(%rdi),%rsi + +# qhasm: c = count(t) +# asm 1: popcnt c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t=int64#2 +# asm 2: movq 48(t=%rsi +movq 48(%rdi),%rsi + +# qhasm: c = count(t) +# asm 1: popcnt c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t=int64#2 +# asm 2: movq 40(t=%rsi +movq 40(%rdi),%rsi + +# qhasm: c = count(t) +# asm 1: popcnt c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t=int64#2 +# asm 2: movq 32(t=%rsi +movq 32(%rdi),%rsi + +# qhasm: c = count(t) +# asm 1: popcnt c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t=int64#2 +# asm 2: movq 24(t=%rsi +movq 24(%rdi),%rsi + +# qhasm: c = count(t) +# asm 1: popcnt c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t=int64#2 +# asm 2: movq 16(t=%rsi +movq 16(%rdi),%rsi + +# qhasm: c = count(t) +# asm 1: popcnt c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t=int64#2 +# asm 2: movq 8(t=%rsi +movq 8(%rdi),%rsi + +# qhasm: c = count(t) +# asm 1: popcnt c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t=int64#1 +# asm 2: movq 0(t=%rdi +movq 0(%rdi),%rdi + +# qhasm: c = count(t) +# asm 1: popcnt c=int64#1 +# asm 2: popcnt c=%rdi +popcnt %rdi, %rdi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1, +#include + +#include "aes.h" + + +void PQCLEAN_MCELIECE348864F_AVX_aes256ctr( + uint8_t *out, + size_t outlen, + const uint8_t nonce[AESCTR_NONCEBYTES], + const uint8_t key[AES256_KEYBYTES] +); + +#endif diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/api.h b/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/api.h new file mode 100644 index 0000000000..4dbdab6d2c --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/api.h @@ -0,0 +1,33 @@ +#ifndef PQCLEAN_MCELIECE348864F_AVX_API_H +#define PQCLEAN_MCELIECE348864F_AVX_API_H + +#include + +#define PQCLEAN_MCELIECE348864F_AVX_CRYPTO_ALGNAME "Classic McEliece 
348864f" +#define PQCLEAN_MCELIECE348864F_AVX_CRYPTO_PUBLICKEYBYTES 261120 +#define PQCLEAN_MCELIECE348864F_AVX_CRYPTO_SECRETKEYBYTES 6452 +#define PQCLEAN_MCELIECE348864F_AVX_CRYPTO_CIPHERTEXTBYTES 128 +#define PQCLEAN_MCELIECE348864F_AVX_CRYPTO_BYTES 32 + + + +int PQCLEAN_MCELIECE348864F_AVX_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const uint8_t *pk +); + +int PQCLEAN_MCELIECE348864F_AVX_crypto_kem_dec( + uint8_t *key, + const uint8_t *c, + const uint8_t *sk +); + +int PQCLEAN_MCELIECE348864F_AVX_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/benes.c b/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/benes.c new file mode 100644 index 0000000000..8d3f218da5 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/benes.c @@ -0,0 +1,287 @@ +/* + This file is for Benes network related functions +*/ +#include "benes.h" + +#include "params.h" +#include "transpose.h" +#include "util.h" + +static void layer_0(uint64_t *bs, const uint64_t *cond) { + int x; + uint64_t diff; + + for (x = 0; x < (1 << 6); x += 2) { + diff = bs[ x ] ^ bs[ x + 1 ]; + diff &= *cond++; + bs[ x ] ^= diff; + bs[ x + 1 ] ^= diff; + } +} + +static void layer_1(uint64_t *bs, const uint64_t *cond) { + int x; + uint64_t diff; + + for (x = 0; x < (1 << 6); x += 4) { + diff = bs[ x + 0 ] ^ bs[ x + 2 ]; + diff &= cond[0]; + bs[ x + 0 ] ^= diff; + bs[ x + 2 ] ^= diff; + + diff = bs[ x + 1 ] ^ bs[ x + 3 ]; + diff &= cond[1]; + bs[ x + 1 ] ^= diff; + bs[ x + 3 ] ^= diff; + + cond += 2; + } +} + +static void layer_2(uint64_t *bs, const uint64_t *cond) { + int x; + uint64_t diff; + + for (x = 0; x < (1 << 6); x += 8) { + diff = bs[ x + 0 ] ^ bs[ x + 4 ]; + diff &= cond[0]; + bs[ x + 0 ] ^= diff; + bs[ x + 4 ] ^= diff; + + diff = bs[ x + 1 ] ^ bs[ x + 5 ]; + diff &= cond[1]; + bs[ x + 1 ] ^= diff; + bs[ x + 5 ] ^= diff; + + diff = bs[ x + 2 ] ^ bs[ x + 6 ]; + diff &= cond[2]; + bs[ x + 2 ] ^= diff; + bs[ x + 6 ] ^= diff; + + diff = bs[ x + 3 ] ^ bs[ x + 7 ]; + diff &= cond[3]; + bs[ x + 3 ] ^= diff; + bs[ x + 7 ] ^= diff; + + cond += 4; + } +} + +static void layer_3(uint64_t *bs, const uint64_t *cond) { + int x, s; + uint64_t diff; + + for (x = 0; x < (1 << 6); x += 16) { + for (s = x; s < x + 8; s += 4) { + diff = bs[ s + 0 ] ^ bs[ s + 8 ]; + diff &= cond[0]; + bs[ s + 0 ] ^= diff; + bs[ s + 8 ] ^= diff; + + diff = bs[ s + 1 ] ^ bs[ s + 9 ]; + diff &= cond[1]; + bs[ s + 1 ] ^= diff; + bs[ s + 9 ] ^= diff; + + diff = bs[ s + 2 ] ^ bs[ s + 10 ]; + diff &= cond[2]; + bs[ s + 2 ] ^= diff; + bs[ s + 10 ] ^= diff; + + diff = bs[ s + 3 ] ^ bs[ s + 11 ]; + diff &= cond[3]; + bs[ s + 3 ] ^= diff; + bs[ s + 11 ] ^= diff; + + cond += 4; + } + } +} + +static void layer_4(uint64_t *bs, const uint64_t *cond) { + int x, s; + uint64_t diff; + + for (x = 0; x < (1 << 6); x += 32) { + for (s = x; s < x + 16; s += 4) { + diff = bs[ s + 0 ] ^ bs[ s + 16 ]; + diff &= cond[0]; + bs[ s + 0 ] ^= diff; + bs[ s + 16 ] ^= diff; + + diff = bs[ s + 1 ] ^ bs[ s + 17 ]; + diff &= cond[1]; + bs[ s + 1 ] ^= diff; + bs[ s + 17 ] ^= diff; + + diff = bs[ s + 2 ] ^ bs[ s + 18 ]; + diff &= cond[2]; + bs[ s + 2 ] ^= diff; + bs[ s + 18 ] ^= diff; + + diff = bs[ s + 3 ] ^ bs[ s + 19 ]; + diff &= cond[3]; + bs[ s + 3 ] ^= diff; + bs[ s + 19 ] ^= diff; + + cond += 4; + } + } +} + +static void layer_5(uint64_t *bs, const uint64_t *cond) { + int x, s; + uint64_t diff; + + for (x = 0; x < (1 << 6); x += 64) { + for (s = x; s < x + 32; s += 4) { + diff = bs[ s + 0 ] ^ 
bs[ s + 32 ]; + diff &= cond[0]; + bs[ s + 0 ] ^= diff; + bs[ s + 32 ] ^= diff; + + diff = bs[ s + 1 ] ^ bs[ s + 33 ]; + diff &= cond[1]; + bs[ s + 1 ] ^= diff; + bs[ s + 33 ] ^= diff; + + diff = bs[ s + 2 ] ^ bs[ s + 34 ]; + diff &= cond[2]; + bs[ s + 2 ] ^= diff; + bs[ s + 34 ] ^= diff; + + diff = bs[ s + 3 ] ^ bs[ s + 35 ]; + diff &= cond[3]; + bs[ s + 3 ] ^= diff; + bs[ s + 35 ] ^= diff; + + cond += 4; + } + } +} + +/* input: bits, control bits as array of bytes */ +/* output: out, control bits as array of 128-bit vectors */ +void PQCLEAN_MCELIECE348864F_AVX_load_bits(uint64_t out[][32], const unsigned char *bits) { + int i, low, block = 0; + + uint64_t cond[64]; + + // + + for (low = 0; low <= 5; low++) { + for (i = 0; i < 64; i++) { + cond[i] = PQCLEAN_MCELIECE348864F_AVX_load4(bits + block * 256 + i * 4); + } + PQCLEAN_MCELIECE348864F_AVX_transpose_64x64(cond); + + for (i = 0; i < 32; i++) { + out[ block ][i] = cond[i]; + } + block++; + } + + for (low = 0; low <= 5; low++) { + for (i = 0; i < 32; i++) { + out[ block ][i] = PQCLEAN_MCELIECE348864F_AVX_load8(bits + block * 256 + i * 8); + } + block++; + } + + for (low = 4; low >= 0; low--) { + for (i = 0; i < 32; i++) { + out[ block ][i] = PQCLEAN_MCELIECE348864F_AVX_load8(bits + block * 256 + i * 8); + } + block++; + } + + for (low = 5; low >= 0; low--) { + for (i = 0; i < 64; i++) { + cond[i] = PQCLEAN_MCELIECE348864F_AVX_load4(bits + block * 256 + i * 4); + } + PQCLEAN_MCELIECE348864F_AVX_transpose_64x64(cond); + + for (i = 0; i < 32; i++) { + out[ block ][i] = cond[i]; + } + block++; + } +} + +/* input: r, sequence of bits to be permuted */ +/* cond, control bits as array of 128-bit vectors */ +/* rev, 0 for normal application; !0 for inverse */ +/* output: r, permuted bits */ +void PQCLEAN_MCELIECE348864F_AVX_benes(uint64_t *r, uint64_t cond[][32], int rev) { + int block, inc; + + uint64_t *bs = r; + + // + + if (rev == 0) { + block = 0; + inc = 1; + } else { + block = 22; + inc = -1; + } + + PQCLEAN_MCELIECE348864F_AVX_transpose_64x64(bs); + + layer_0(bs, cond[ block ]); + block += inc; + layer_1(bs, cond[ block ]); + block += inc; + layer_2(bs, cond[ block ]); + block += inc; + layer_3(bs, cond[ block ]); + block += inc; + layer_4(bs, cond[ block ]); + block += inc; + layer_5(bs, cond[ block ]); + block += inc; + + PQCLEAN_MCELIECE348864F_AVX_transpose_64x64(bs); + + layer_0(bs, cond[ block ]); + block += inc; + layer_1(bs, cond[ block ]); + block += inc; + layer_2(bs, cond[ block ]); + block += inc; + layer_3(bs, cond[ block ]); + block += inc; + layer_4(bs, cond[ block ]); + block += inc; + layer_5(bs, cond[ block ]); + block += inc; + layer_4(bs, cond[ block ]); + block += inc; + layer_3(bs, cond[ block ]); + block += inc; + layer_2(bs, cond[ block ]); + block += inc; + layer_1(bs, cond[ block ]); + block += inc; + layer_0(bs, cond[ block ]); + block += inc; + + PQCLEAN_MCELIECE348864F_AVX_transpose_64x64(bs); + + layer_5(bs, cond[ block ]); + block += inc; + layer_4(bs, cond[ block ]); + block += inc; + layer_3(bs, cond[ block ]); + block += inc; + layer_2(bs, cond[ block ]); + block += inc; + layer_1(bs, cond[ block ]); + block += inc; + layer_0(bs, cond[ block ]); + //block += inc; + + PQCLEAN_MCELIECE348864F_AVX_transpose_64x64(bs); +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/benes.h b/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/benes.h new file mode 100644 index 0000000000..53398f3e22 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/benes.h @@ -0,0 +1,15 @@ 
+#ifndef PQCLEAN_MCELIECE348864F_AVX_BENES_H +#define PQCLEAN_MCELIECE348864F_AVX_BENES_H +/* + This file is for Benes network related functions +*/ + + +#include "gf.h" +#include "vec128.h" + +void PQCLEAN_MCELIECE348864F_AVX_load_bits(uint64_t /*out*/[][32], const unsigned char * /*bits*/); +void PQCLEAN_MCELIECE348864F_AVX_benes(uint64_t * /*r*/, uint64_t /*cond*/[][32], int /*rev*/); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/bm.c b/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/bm.c new file mode 100644 index 0000000000..bfd6bdcb6e --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/bm.c @@ -0,0 +1,219 @@ +/* + This file is for the inversion-free Berlekamp-Massey algorithm + see https://ieeexplore.ieee.org/document/87857 +*/ + +#include "bm.h" + +#include "gf.h" +#include "util.h" +#include "vec.h" +#include "vec128.h" + +#include + +extern void PQCLEAN_MCELIECE348864F_AVX_update_asm(void *, gf, int); +extern gf PQCLEAN_MCELIECE348864F_AVX_vec_reduce_asm(uint64_t *); + +static inline uint64_t mask_nonzero(gf a) { + uint64_t ret = a; + + ret -= 1; + ret >>= 63; + ret -= 1; + + return ret; +} + +static inline uint64_t mask_leq(uint16_t a, uint16_t b) { + uint64_t a_tmp = a; + uint64_t b_tmp = b; + uint64_t ret = b_tmp - a_tmp; + + ret >>= 63; + ret -= 1; + + return ret; +} + +static inline void vec_cmov(uint64_t out[][2], uint64_t mask) { + int i; + + for (i = 0; i < GFBITS; i++) { + out[i][0] = (out[i][0] & ~mask) | (out[i][1] & mask); + } +} + +static inline void interleave(vec128 *in, int idx0, int idx1, vec128 *mask, int b) { + int s = 1 << b; + + vec128 x, y; + + x = PQCLEAN_MCELIECE348864F_AVX_vec128_or(PQCLEAN_MCELIECE348864F_AVX_vec128_and(in[idx0], mask[0]), + PQCLEAN_MCELIECE348864F_AVX_vec128_sll_2x(PQCLEAN_MCELIECE348864F_AVX_vec128_and(in[idx1], mask[0]), s)); + + y = PQCLEAN_MCELIECE348864F_AVX_vec128_or(PQCLEAN_MCELIECE348864F_AVX_vec128_srl_2x(PQCLEAN_MCELIECE348864F_AVX_vec128_and(in[idx0], mask[1]), s), + PQCLEAN_MCELIECE348864F_AVX_vec128_and(in[idx1], mask[1])); + + in[idx0] = x; + in[idx1] = y; +} + +/* input: in, field elements in bitsliced form */ +/* output: out, field elements in non-bitsliced form */ +static inline void get_coefs(gf *out, vec128 *in) { + int i, k; + + vec128 mask[4][2]; + vec128 buf[16]; + + for (i = 0; i < GFBITS; i++) { + buf[i] = in[i]; + } + for (i = GFBITS; i < 16; i++) { + buf[i] = PQCLEAN_MCELIECE348864F_AVX_vec128_setzero(); + } + + mask[0][0] = PQCLEAN_MCELIECE348864F_AVX_vec128_set1_16b(0x5555); + mask[0][1] = PQCLEAN_MCELIECE348864F_AVX_vec128_set1_16b(0xAAAA); + mask[1][0] = PQCLEAN_MCELIECE348864F_AVX_vec128_set1_16b(0x3333); + mask[1][1] = PQCLEAN_MCELIECE348864F_AVX_vec128_set1_16b(0xCCCC); + mask[2][0] = PQCLEAN_MCELIECE348864F_AVX_vec128_set1_16b(0x0F0F); + mask[2][1] = PQCLEAN_MCELIECE348864F_AVX_vec128_set1_16b(0xF0F0); + mask[3][0] = PQCLEAN_MCELIECE348864F_AVX_vec128_set1_16b(0x00FF); + mask[3][1] = PQCLEAN_MCELIECE348864F_AVX_vec128_set1_16b(0xFF00); + + interleave(buf, 0, 8, mask[3], 3); + interleave(buf, 1, 9, mask[3], 3); + interleave(buf, 2, 10, mask[3], 3); + interleave(buf, 3, 11, mask[3], 3); + interleave(buf, 4, 12, mask[3], 3); + interleave(buf, 5, 13, mask[3], 3); + interleave(buf, 6, 14, mask[3], 3); + interleave(buf, 7, 15, mask[3], 3); + + interleave(buf, 0, 4, mask[2], 2); + interleave(buf, 1, 5, mask[2], 2); + interleave(buf, 2, 6, mask[2], 2); + interleave(buf, 3, 7, mask[2], 2); + interleave(buf, 8, 12, mask[2], 2); + interleave(buf, 9, 13, 
mask[2], 2); + interleave(buf, 10, 14, mask[2], 2); + interleave(buf, 11, 15, mask[2], 2); + + interleave(buf, 0, 2, mask[1], 1); + interleave(buf, 1, 3, mask[1], 1); + interleave(buf, 4, 6, mask[1], 1); + interleave(buf, 5, 7, mask[1], 1); + interleave(buf, 8, 10, mask[1], 1); + interleave(buf, 9, 11, mask[1], 1); + interleave(buf, 12, 14, mask[1], 1); + interleave(buf, 13, 15, mask[1], 1); + + interleave(buf, 0, 1, mask[0], 0); + interleave(buf, 2, 3, mask[0], 0); + interleave(buf, 4, 5, mask[0], 0); + interleave(buf, 6, 7, mask[0], 0); + interleave(buf, 8, 9, mask[0], 0); + interleave(buf, 10, 11, mask[0], 0); + interleave(buf, 12, 13, mask[0], 0); + interleave(buf, 14, 15, mask[0], 0); + + for (i = 0; i < 16; i++) { + for (k = 0; k < 4; k++) { + out[ (4 * 0 + k) * 16 + i ] = (PQCLEAN_MCELIECE348864F_AVX_vec128_extract(buf[i], 0) >> (k * 16)) & GFMASK; + out[ (4 * 1 + k) * 16 + i ] = (PQCLEAN_MCELIECE348864F_AVX_vec128_extract(buf[i], 1) >> (k * 16)) & GFMASK; + } + } +} + +/* input: in, field elements in bitsliced form */ +/* output: out, field elements in non-bitsliced form */ +void PQCLEAN_MCELIECE348864F_AVX_bm(uint64_t out[ GFBITS ], vec128 in[ GFBITS ]) { + uint16_t i; + uint16_t N, L; + + uint64_t prod[ GFBITS ]; + uint64_t in_tmp[ GFBITS ]; + + uint64_t db[ GFBITS ][ 2 ]; + uint64_t BC_tmp[ GFBITS ][ 2 ]; + uint64_t BC[ GFBITS ][ 2 ]; + + uint64_t mask, t; + + gf d, b, c0 = 1; + + gf coefs[SYS_T * 2]; + + // init + + BC[0][1] = 0; + BC[0][0] = 1; + BC[0][0] <<= 63; + + for (i = 1; i < GFBITS; i++) { + BC[i][0] = BC[i][1] = 0; + } + + b = 1; + L = 0; + + // + + get_coefs(coefs, in); + + for (i = 0; i < GFBITS; i++) { + in_tmp[i] = 0; + } + + for (N = 0; N < SYS_T * 2; N++) { + // computing d + + PQCLEAN_MCELIECE348864F_AVX_vec_mul_sp(prod, in_tmp, &BC[0][0]); + + PQCLEAN_MCELIECE348864F_AVX_update_asm(in_tmp, coefs[N], 8); + + d = PQCLEAN_MCELIECE348864F_AVX_vec_reduce_asm(prod); + + t = PQCLEAN_MCELIECE348864F_AVX_gf_mul2(c0, coefs[N], b); + + d ^= t & 0xFFFFFFFF; + + // 3 cases + + mask = mask_nonzero(d) & mask_leq(L * 2, N); + + for (i = 0; i < GFBITS; i++) { + db[i][0] = (d >> i) & 1; + db[i][0] = -db[i][0]; + db[i][1] = (b >> i) & 1; + db[i][1] = -db[i][1]; + } + + PQCLEAN_MCELIECE348864F_AVX_vec128_mul((vec128 *) BC_tmp, (vec128 *) db, (vec128 *) BC); + + vec_cmov(BC, mask); + + PQCLEAN_MCELIECE348864F_AVX_update_asm(BC, mask & c0, 16); + + for (i = 0; i < GFBITS; i++) { + BC[i][1] = BC_tmp[i][0] ^ BC_tmp[i][1]; + } + + c0 = t >> 32; + b = (d & mask) | (b & ~mask); + L = ((N + 1 - L) & mask) | (L & ~mask); + + } + + c0 = PQCLEAN_MCELIECE348864F_AVX_gf_inv(c0); + + for (i = 0; i < GFBITS; i++) { + out[i] = (c0 >> i) & 1; + out[i] = -out[i]; + } + + PQCLEAN_MCELIECE348864F_AVX_vec_mul_sp(out, out, &BC[0][0]); +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/bm.h b/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/bm.h new file mode 100644 index 0000000000..6852977a16 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/bm.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE348864F_AVX_BM_H +#define PQCLEAN_MCELIECE348864F_AVX_BM_H +/* + This file is for the inversion-free Berlekamp-Massey algorithm + see https://ieeexplore.ieee.org/document/87857 +*/ + + +#include "vec128.h" + +void PQCLEAN_MCELIECE348864F_AVX_bm(uint64_t * /*out*/, vec128 * /*in*/); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/consts.S b/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/consts.S new file mode 100644 index 
0000000000..63e6defe50 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/consts.S @@ -0,0 +1,33 @@ +.data + +# not supported on MacOS +#.section .rodata + +.globl PQCLEAN_MCELIECE348864F_AVX_MASK0_0 +.globl PQCLEAN_MCELIECE348864F_AVX_MASK0_1 +.globl PQCLEAN_MCELIECE348864F_AVX_MASK1_0 +.globl PQCLEAN_MCELIECE348864F_AVX_MASK1_1 +.globl PQCLEAN_MCELIECE348864F_AVX_MASK2_0 +.globl PQCLEAN_MCELIECE348864F_AVX_MASK2_1 +.globl PQCLEAN_MCELIECE348864F_AVX_MASK3_0 +.globl PQCLEAN_MCELIECE348864F_AVX_MASK3_1 +.globl PQCLEAN_MCELIECE348864F_AVX_MASK4_0 +.globl PQCLEAN_MCELIECE348864F_AVX_MASK4_1 +.globl PQCLEAN_MCELIECE348864F_AVX_MASK5_0 +.globl PQCLEAN_MCELIECE348864F_AVX_MASK5_1 + +.p2align 5 + +PQCLEAN_MCELIECE348864F_AVX_MASK0_0: .quad 0x5555555555555555, 0x5555555555555555, 0x5555555555555555, 0x5555555555555555 +PQCLEAN_MCELIECE348864F_AVX_MASK0_1: .quad 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA +PQCLEAN_MCELIECE348864F_AVX_MASK1_0: .quad 0x3333333333333333, 0x3333333333333333, 0x3333333333333333, 0x3333333333333333 +PQCLEAN_MCELIECE348864F_AVX_MASK1_1: .quad 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC +PQCLEAN_MCELIECE348864F_AVX_MASK2_0: .quad 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F +PQCLEAN_MCELIECE348864F_AVX_MASK2_1: .quad 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0 +PQCLEAN_MCELIECE348864F_AVX_MASK3_0: .quad 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF +PQCLEAN_MCELIECE348864F_AVX_MASK3_1: .quad 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00 +PQCLEAN_MCELIECE348864F_AVX_MASK4_0: .quad 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF +PQCLEAN_MCELIECE348864F_AVX_MASK4_1: .quad 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000 +PQCLEAN_MCELIECE348864F_AVX_MASK5_0: .quad 0x00000000FFFFFFFF, 0x00000000FFFFFFFF, 0x00000000FFFFFFFF, 0x00000000FFFFFFFF +PQCLEAN_MCELIECE348864F_AVX_MASK5_1: .quad 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000 + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/consts.inc b/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/consts.inc new file mode 100644 index 0000000000..c93f9c7994 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/consts.inc @@ -0,0 +1,238 @@ +{ + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF00F0FF0F00F0FF0, 0xF00F0FF0F00F0FF0, 0xF00F0FF0F00F0FF0, 0xF00F0FF0F00F0FF0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F00F0F0F0FF0F0, 0xF0F00F0F0F0FF0F0, 0xF0F00F0F0F0FF0F0, 0xF0F00F0F0F0FF0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0FF00FF00FF00FF0, 0x0FF00FF00FF00FF0, 0x0FF00FF00FF00FF0, 0x0FF00FF00FF00FF0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xAA5555AAAA5555AA, 0xAA5555AAAA5555AA, 0xAA5555AAAA5555AA, 0xAA5555AAAA5555AA), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF00F0FF0F00F0FF0, 0xF00F0FF0F00F0FF0, 0xF00F0FF0F00F0FF0, 0xF00F0FF0F00F0FF0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x33CCCC33CC3333CC, 0x33CCCC33CC3333CC, 0x33CCCC33CC3333CC, 0x33CCCC33CC3333CC), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xCC33CC3333CC33CC, 0xCC33CC3333CC33CC, 0xCC33CC3333CC33CC, 0xCC33CC3333CC33CC), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x33CC33CC33CC33CC, 
0x33CC33CC33CC33CC, 0x33CC33CC33CC33CC, 0x33CC33CC33CC33CC), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x5A5A5A5A5A5A5A5A, 0x5A5A5A5A5A5A5A5A, 0x5A5A5A5A5A5A5A5A, 0x5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF00F0FF0F00F0FF0, 0xF00F0FF0F00F0FF0, 0xF00F0FF0F00F0FF0, 0xF00F0FF0F00F0FF0), +}, +{ + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x3C3C3C3C3C3C3C3C, 0x3C3C3C3C3C3C3C3C, 0x3C3C3C3C3C3C3C3C, 0x3C3C3C3C3C3C3C3C), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0x0F0F0F0F0F0F0F0F, 0xF0F0F0F0F0F0F0F0, 0x0F0F0F0F0F0F0F0F), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x5555AAAA5555AAAA, 0xAAAA5555AAAA5555, 0x5555AAAA5555AAAA, 0xAAAA5555AAAA5555), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xCC3333CCCC3333CC, 0xCC3333CCCC3333CC, 0xCC3333CCCC3333CC, 0xCC3333CCCC3333CC), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xC33CC33CC33CC33C, 0xC33CC33CC33CC33C, 0xC33CC33CC33CC33C, 0xC33CC33CC33CC33C), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x55555555AAAAAAAA, 0x55555555AAAAAAAA, 0x55555555AAAAAAAA, 0x55555555AAAAAAAA), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x33333333CCCCCCCC, 0x33333333CCCCCCCC, 0x33333333CCCCCCCC, 0x33333333CCCCCCCC), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x00FF00FFFF00FF00, 0xFF00FF0000FF00FF, 0x00FF00FFFF00FF00, 0xFF00FF0000FF00FF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0x0F0F0F0F0F0F0F0F, 0xF0F0F0F0F0F0F0F0, 0x0F0F0F0F0F0F0F0F), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000FFFFFFFF0000, 0x0000FFFFFFFF0000, 0x0000FFFFFFFF0000, 0x0000FFFFFFFF0000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F00F0F0F0FF0F0, 0xF0F00F0F0F0FF0F0, 0xF0F00F0F0F0FF0F0, 0xF0F00F0F0F0FF0F0), +}, +{ + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xAA55AA5555AA55AA, 0x55AA55AAAA55AA55, 0xAA55AA5555AA55AA, 0x55AA55AAAA55AA55), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xCC33CC3333CC33CC, 0xCC33CC3333CC33CC, 0x33CC33CCCC33CC33, 0x33CC33CCCC33CC33), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x33CCCC33CC3333CC, 0xCC3333CC33CCCC33, 0xCC3333CC33CCCC33, 0x33CCCC33CC3333CC), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x55555555AAAAAAAA, 0x55555555AAAAAAAA, 0x55555555AAAAAAAA, 0x55555555AAAAAAAA), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFF0000FF00FFFF00, 0xFF0000FF00FFFF00, 0x00FFFF00FF0000FF, 0x00FFFF00FF0000FF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x3CC33CC3C33CC33C, 0xC33CC33C3CC33CC3, 0x3CC33CC3C33CC33C, 0xC33CC33C3CC33CC3), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x5555AAAA5555AAAA, 0xAAAA5555AAAA5555, 0x5555AAAA5555AAAA, 0xAAAA5555AAAA5555), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0FF00FF00FF00FF0, 0xF00FF00FF00FF00F, 0x0FF00FF00FF00FF0, 0xF00FF00FF00FF00F), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xCCCC33333333CCCC, 0x3333CCCCCCCC3333, 0x3333CCCCCCCC3333, 0xCCCC33333333CCCC), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0x0F0F0F0F0F0F0F0F, 0xF0F0F0F0F0F0F0F0, 0x0F0F0F0F0F0F0F0F), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x00FFFF0000FFFF00, 0xFF0000FFFF0000FF, 0x00FFFF0000FFFF00, 0xFF0000FFFF0000FF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xC33CC33CC33CC33C, 0xC33CC33CC33CC33C, 0xC33CC33CC33CC33C, 0xC33CC33CC33CC33C), +}, +{ + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x6699669999669966, 0x9966996666996699, 0x6699669999669966, 0x9966996666996699), 
+ PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x33CCCC33CC3333CC, 0x33CCCC33CC3333CC, 0x33CCCC33CC3333CC, 0x33CCCC33CC3333CC), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xA5A5A5A55A5A5A5A, 0xA5A5A5A55A5A5A5A, 0x5A5A5A5AA5A5A5A5, 0x5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x3C3CC3C3C3C33C3C, 0x3C3CC3C3C3C33C3C, 0xC3C33C3C3C3CC3C3, 0xC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF00FF00F0FF00FF0, 0x0FF00FF0F00FF00F, 0x0FF00FF0F00FF00F, 0xF00FF00F0FF00FF0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x3C3CC3C3C3C33C3C, 0x3C3CC3C3C3C33C3C, 0xC3C33C3C3C3CC3C3, 0xC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0xF0F0F0F00F0F0F0F, 0x0F0F0F0FF0F0F0F0, 0xF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x33CCCC33CC3333CC, 0xCC3333CC33CCCC33, 0x33CCCC33CC3333CC, 0xCC3333CC33CCCC33), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0x0F0F0F0F0F0F0F0F, 0xF0F0F0F0F0F0F0F0, 0x0F0F0F0F0F0F0F0F), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A), +}, +{ + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x6699669999669966, 0x9966996666996699, 0x6699669999669966, 0x9966996666996699), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xCC3333CC33CCCC33, 0xCC3333CC33CCCC33, 0xCC3333CC33CCCC33, 0xCC3333CC33CCCC33), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x5A5A5A5AA5A5A5A5, 0x5A5A5A5AA5A5A5A5, 0xA5A5A5A55A5A5A5A, 0xA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x3C3CC3C3C3C33C3C, 0x3C3CC3C3C3C33C3C, 0xC3C33C3C3C3CC3C3, 0xC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0FF00FF0F00FF00F, 0xF00FF00F0FF00FF0, 0xF00FF00F0FF00FF0, 0x0FF00FF0F00FF00F), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x3C3CC3C3C3C33C3C, 0x3C3CC3C3C3C33C3C, 0xC3C33C3C3C3CC3C3, 0xC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0xF0F0F0F00F0F0F0F, 0x0F0F0F0FF0F0F0F0, 0xF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x33CCCC33CC3333CC, 0xCC3333CC33CCCC33, 0x33CCCC33CC3333CC, 0xCC3333CC33CCCC33), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0x0F0F0F0F0F0F0F0F, 0xF0F0F0F0F0F0F0F0, 0x0F0F0F0F0F0F0F0F), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A), +}, +{ + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x9669699696696996, 0x9669699696696996, 0x9669699696696996, 0x9669699696696996), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x6996699669966996, 0x6996699669966996, 0x6996699669966996, 0x6996699669966996), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x6996699669966996, 0x6996699669966996, 0x6996699669966996, 0x6996699669966996), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x00FFFF0000FFFF00, 0x00FFFF0000FFFF00, 0xFF0000FFFF0000FF, 0xFF0000FFFF0000FF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF00FF00F0FF00FF0, 
0x0FF00FF0F00FF00F, 0x0FF00FF0F00FF00F, 0xF00FF00F0FF00FF0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F00F0F0F0FF0F0, 0x0F0FF0F0F0F00F0F, 0x0F0FF0F0F0F00F0F, 0xF0F00F0F0F0FF0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3, 0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3, 0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xC33C3CC33CC3C33C, 0xC33C3CC33CC3C33C, 0xC33C3CC33CC3C33C, 0xC33C3CC33CC3C33C), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x3CC3C33C3CC3C33C, 0x3CC3C33C3CC3C33C, 0x3CC3C33C3CC3C33C, 0x3CC3C33C3CC3C33C), +}, +{ + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x9669699696696996, 0x9669699696696996, 0x9669699696696996, 0x9669699696696996), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x6996699669966996, 0x6996699669966996, 0x6996699669966996, 0x6996699669966996), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x9669966996699669, 0x9669966996699669, 0x9669966996699669, 0x9669966996699669), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFF0000FFFF0000FF, 0xFF0000FFFF0000FF, 0x00FFFF0000FFFF00, 0x00FFFF0000FFFF00), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x00FF00FF00FF00FF, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0x00FF00FF00FF00FF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF00FF00F0FF00FF0, 0x0FF00FF0F00FF00F, 0x0FF00FF0F00FF00F, 0xF00FF00F0FF00FF0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F00F0F0F0FF0F0, 0x0F0FF0F0F0F00F0F, 0x0F0FF0F0F0F00F0F, 0xF0F00F0F0F0FF0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3, 0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3, 0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xC33C3CC33CC3C33C, 0xC33C3CC33CC3C33C, 0xC33C3CC33CC3C33C, 0xC33C3CC33CC3C33C), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x3CC3C33C3CC3C33C, 0x3CC3C33C3CC3C33C, 0x3CC3C33C3CC3C33C, 0x3CC3C33C3CC3C33C), +}, +{ + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x9669699696696996, 0x9669699696696996, 0x9669699696696996, 0x9669699696696996), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x9669966996699669, 0x9669966996699669, 0x9669966996699669, 0x9669966996699669), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x9669966996699669, 0x9669966996699669, 0x9669966996699669, 0x9669966996699669), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x00FFFF0000FFFF00, 0x00FFFF0000FFFF00, 0xFF0000FFFF0000FF, 0xFF0000FFFF0000FF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF00FF00F0FF00FF0, 0x0FF00FF0F00FF00F, 0x0FF00FF0F00FF00F, 0xF00FF00F0FF00FF0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F00F0F0F0FF0F0, 0x0F0FF0F0F0F00F0F, 0x0F0FF0F0F0F00F0F, 0xF0F00F0F0F0FF0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3, 0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3, 0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A), + 
PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xC33C3CC33CC3C33C, 0xC33C3CC33CC3C33C, 0xC33C3CC33CC3C33C, 0xC33C3CC33CC3C33C), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x3CC3C33C3CC3C33C, 0x3CC3C33C3CC3C33C, 0x3CC3C33C3CC3C33C, 0x3CC3C33C3CC3C33C), +}, +{ + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x9669699696696996, 0x9669699696696996, 0x9669699696696996, 0x9669699696696996), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x9669966996699669, 0x9669966996699669, 0x9669966996699669, 0x9669966996699669), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x6996699669966996, 0x6996699669966996, 0x6996699669966996, 0x6996699669966996), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFF0000FFFF0000FF, 0xFF0000FFFF0000FF, 0x00FFFF0000FFFF00, 0x00FFFF0000FFFF00), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x00FF00FF00FF00FF, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0x00FF00FF00FF00FF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF00FF00F0FF00FF0, 0x0FF00FF0F00FF00F, 0x0FF00FF0F00FF00F, 0xF00FF00F0FF00FF0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F00F0F0F0FF0F0, 0x0F0FF0F0F0F00F0F, 0x0F0FF0F0F0F00F0F, 0xF0F00F0F0F0FF0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3, 0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3, 0xC33C3CC33CC3C33C, 0x3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A, 0xA55A5AA55AA5A55A), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xC33C3CC33CC3C33C, 0xC33C3CC33CC3C33C, 0xC33C3CC33CC3C33C, 0xC33C3CC33CC3C33C), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x3CC3C33C3CC3C33C, 0x3CC3C33C3CC3C33C, 0x3CC3C33C3CC3C33C, 0x3CC3C33C3CC3C33C), +}, +{ + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 
0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + 
PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xAAAAAAAAAAAAAAAA, 
0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/controlbits.c b/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/controlbits.c new file mode 100644 index 0000000000..08caa6ef75 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/controlbits.c @@ -0,0 +1,274 @@ +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. 
a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + +#include "controlbits.h" + +#include "params.h" + +#include + +typedef uint8_t bit; + +#define N (1 << GFBITS) + +static bit is_smaller(uint32_t a, uint32_t b) { + uint32_t ret = 0; + + ret = a - b; + ret >>= 31; + + return (bit)ret; +} + +static bit is_smaller_63b(uint64_t a, uint64_t b) { + uint64_t ret = 0; + + ret = a - b; + ret >>= 63; + + return (bit)ret; +} + +static void cswap(uint32_t *x, uint32_t *y, bit swap) { + uint32_t m; + uint32_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +static void cswap_63b(uint64_t *x, uint64_t *y, bit swap) { + uint64_t m; + uint64_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +/* output x = min(input x,input y) */ +/* output y = max(input x,input y) */ + +static void minmax(uint32_t *x, uint32_t *y) { + bit m; + + m = is_smaller(*y, *x); + cswap(x, y, m); +} + +static void minmax_63b(uint64_t *x, uint64_t *y) { + bit m; + + m = is_smaller_63b(*y, *x); + cswap_63b(x, y, m); +} + +/* merge first half of x[0],x[step],...,x[(2*n-1)*step] with second half */ +/* requires n to be a power of 2 */ + +static void merge(int n, uint32_t *x, int step) { + int i; + if (n == 1) { + minmax(&x[0], &x[step]); + } else { + merge(n / 2, x, step * 2); + merge(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax(&x[i * step], &x[(i + 1) * step]); + } + } +} + +static void merge_63b(int n, uint64_t *x, int step) { + int i; + if (n == 1) { + minmax_63b(&x[0], &x[step]); + } else { + merge_63b(n / 2, x, step * 2); + merge_63b(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax_63b(&x[i * step], &x[(i + 1) * step]); + } + } +} + +/* sort x[0],x[1],...,x[n-1] in place */ +/* requires n to be a power of 2 */ + +static void sort(int n, uint32_t *x) { + if (n <= 1) { + return; + } + sort(n / 2, x); + sort(n / 2, x + n / 2); + merge(n / 2, x, 1); +} + +void PQCLEAN_MCELIECE348864F_AVX_sort_63b(int n, uint64_t *x) { + if (n <= 1) { + return; + } + PQCLEAN_MCELIECE348864F_AVX_sort_63b(n / 2, x); + PQCLEAN_MCELIECE348864F_AVX_sort_63b(n / 2, x + n / 2); + merge_63b(n / 2, x, 1); +} + +/* y[pi[i]] = x[i] */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void composeinv(int n, uint32_t *y, const uint32_t *x, const uint32_t *pi) { // NC + int i; + uint32_t t[2 * N]; + + for (i = 0; i < n; ++i) { + t[i] = x[i] | (pi[i] << 16); + } + + sort(n, t); + + for (i = 0; i < n; ++i) { + y[i] = t[i] & 0xFFFF; + } +} + +/* ip[i] = j iff pi[i] = j */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void invert(int n, uint32_t *ip, const uint32_t *pi) { + int i; + + for (i = 0; i < n; i++) { + ip[i] = i; + } + + composeinv(n, ip, ip, pi); +} + + +static void flow(int w, uint32_t *x, const uint32_t *y, int t) { + bit m0; + bit m1; + + uint32_t b; + uint32_t y_copy = *y; + + m0 = is_smaller(*y & ((1 << w) - 1), *x & ((1 << w) - 1)); + m1 = is_smaller(0, t); + + cswap(x, &y_copy, m0); + b = m0 & m1; + *x ^= b << w; +} + +/* input: permutation pi */ +/* output: (2w-1)n/2 (or 0 if n==1) control bits c[0],c[step],c[2*step],... 
*/ +/* requires n = 2^w */ +static void controlbitsfrompermutation(int w, int n, int step, int off, unsigned char *c, const uint32_t *pi) { + int i; + int j; + int k; + int t; + uint32_t ip[N] = {0}; + uint32_t I[2 * N] = {0}; + uint32_t P[2 * N] = {0}; + uint32_t PI[2 * N] = {0}; + uint32_t T[2 * N] = {0}; + uint32_t piflip[N] = {0}; + uint32_t subpi[2][N / 2] = {{0}}; + + if (w == 1) { + c[ off / 8 ] |= (pi[0] & 1) << (off % 8); + } + if (w <= 1) { + return; + } + + invert(n, ip, pi); + + for (i = 0; i < n; ++i) { + I[i] = ip[i] | (1 << w); + I[n + i] = pi[i]; + } + + for (i = 0; i < 2 * n; ++i) { + P[i] = (i >> w) + (i & ((1 << w) - 2)) + ((i & 1) << w); + } + + for (t = 0; t < w; ++t) { + composeinv(2 * n, PI, P, I); + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &PI[i], t); + } + + for (i = 0; i < 2 * n; ++i) { + T[i] = I[i ^ 1]; + } + + composeinv(2 * n, I, I, T); + + for (i = 0; i < 2 * n; ++i) { + T[i] = P[i ^ 1]; + } + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &T[i], 1); + } + } + + for (i = 0; i < n; ++i) { + for (j = 0; j < w; ++j) { + piflip[i] = pi[i]; + } + } + + for (i = 0; i < n / 2; ++i) { + c[ (off + i * step) / 8 ] |= ((P[i * 2] >> w) & 1) << ((off + i * step) % 8); + } + for (i = 0; i < n / 2; ++i) { + c[ (off + ((w - 1)*n + i) * step) / 8 ] |= ((P[n + i * 2] >> w) & 1) << ((off + ((w - 1) * n + i) * step) % 8); + } + + for (i = 0; i < n / 2; ++i) { + cswap(&piflip[i * 2], &piflip[i * 2 + 1], (P[n + i * 2] >> w) & 1); + } + + for (k = 0; k < 2; ++k) { + for (i = 0; i < n / 2; ++i) { + subpi[k][i] = piflip[i * 2 + k] >> 1; + } + } + + for (k = 0; k < 2; ++k) { + controlbitsfrompermutation(w - 1, n / 2, step * 2, off + step * (n / 2 + k), c, subpi[k]); + } +} + +/* input: pi, a permutation*/ +/* output: out, control bits w.r.t. pi */ +void PQCLEAN_MCELIECE348864F_AVX_controlbits(unsigned char *out, const uint32_t *pi) { + unsigned int i; + unsigned char c[ (2 * GFBITS - 1) * (1 << GFBITS) / 16 ]; + + for (i = 0; i < sizeof(c); i++) { + c[i] = 0; + } + + controlbitsfrompermutation(GFBITS, (1 << GFBITS), 1, 0, c, pi); + + for (i = 0; i < sizeof(c); i++) { + out[i] = c[i]; + } +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/controlbits.h b/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/controlbits.h new file mode 100644 index 0000000000..e70e727d80 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/controlbits.h @@ -0,0 +1,15 @@ +#ifndef PQCLEAN_MCELIECE348864F_AVX_CONTROLBITS_H +#define PQCLEAN_MCELIECE348864F_AVX_CONTROLBITS_H +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. 
a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + + +#include + +void PQCLEAN_MCELIECE348864F_AVX_sort_63b(int n, uint64_t *x); +void PQCLEAN_MCELIECE348864F_AVX_controlbits(unsigned char *out, const uint32_t *pi); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/crypto_hash.h b/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/crypto_hash.h new file mode 100644 index 0000000000..a294e428ce --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/crypto_hash.h @@ -0,0 +1,7 @@ +#ifndef PQCLEAN_MCELIECE348864F_AVX_CRYPTO_HASH_H +#define PQCLEAN_MCELIECE348864F_AVX_CRYPTO_HASH_H +#include "fips202.h" + +#define crypto_hash_32b(out,in,inlen) shake256(out, 32, in, inlen) + +#endif diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/decrypt.c b/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/decrypt.c new file mode 100644 index 0000000000..7737514cad --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/decrypt.c @@ -0,0 +1,234 @@ +/* + This file is for Niederreiter decryption +*/ + +#include "decrypt.h" + +#include "benes.h" +#include "bm.h" +#include "fft.h" +#include "fft_tr.h" +#include "params.h" +#include "util.h" + +#include + +static void scaling(vec256 out[][GFBITS], vec256 inv[][GFBITS], const unsigned char *sk, vec256 *recv) { + int i, j; + + uint64_t sk_int[ GFBITS ]; + vec256 eval[16][ GFBITS ]; + vec256 tmp[ GFBITS ]; + + // computing inverses + + PQCLEAN_MCELIECE348864F_AVX_irr_load(sk_int, sk); + + PQCLEAN_MCELIECE348864F_AVX_fft(eval, sk_int); + + for (i = 0; i < 16; i++) { + PQCLEAN_MCELIECE348864F_AVX_vec256_sq(eval[i], eval[i]); + } + + PQCLEAN_MCELIECE348864F_AVX_vec256_copy(inv[0], eval[0]); + + for (i = 1; i < 16; i++) { + PQCLEAN_MCELIECE348864F_AVX_vec256_mul(inv[i], inv[i - 1], eval[i]); + } + + PQCLEAN_MCELIECE348864F_AVX_vec256_inv(tmp, inv[15]); + + for (i = 14; i >= 0; i--) { + PQCLEAN_MCELIECE348864F_AVX_vec256_mul(inv[i + 1], tmp, inv[i]); + PQCLEAN_MCELIECE348864F_AVX_vec256_mul(tmp, tmp, eval[i + 1]); + } + + PQCLEAN_MCELIECE348864F_AVX_vec256_copy(inv[0], tmp); + + // + + for (i = 0; i < 16; i++) { + for (j = 0; j < GFBITS; j++) { + out[i][j] = PQCLEAN_MCELIECE348864F_AVX_vec256_and(inv[i][j], recv[i]); + } + } +} + +static void preprocess(vec128 *recv, const unsigned char *s) { + int i; + unsigned char r[ 512 ]; + + for (i = 0; i < SYND_BYTES; i++) { + r[i] = s[i]; + } + + for (i = SYND_BYTES; i < 512; i++) { + r[i] = 0; + } + + for (i = 0; i < 32; i++) { + recv[i] = PQCLEAN_MCELIECE348864F_AVX_load16(r + i * 16); + } +} + +static void postprocess(unsigned char *e, vec128 *err) { + int i; + unsigned char error8[ (1 << GFBITS) / 8 ]; + uint64_t v[2]; + + for (i = 0; i < 32; i++) { + v[0] = PQCLEAN_MCELIECE348864F_AVX_vec128_extract(err[i], 0); + v[1] = PQCLEAN_MCELIECE348864F_AVX_vec128_extract(err[i], 1); + + PQCLEAN_MCELIECE348864F_AVX_store8(error8 + i * 16 + 0, v[0]); + PQCLEAN_MCELIECE348864F_AVX_store8(error8 + i * 16 + 8, v[1]); + } + + for (i = 0; i < SYS_N / 8; i++) { + e[i] = error8[i]; + } +} + +static void scaling_inv(vec256 out[][GFBITS], vec256 inv[][GFBITS], vec256 *recv) { + int i, j; + + for (i = 0; i < 16; i++) { + for (j = 0; j < GFBITS; j++) { + out[i][j] = PQCLEAN_MCELIECE348864F_AVX_vec256_and(inv[i][j], recv[i]); + } + } +} + +static uint16_t weight_check(unsigned char *e, vec128 *error) { + int i; + uint16_t w0 = 0; + uint16_t w1 = 0; + uint16_t check; + + for (i = 0; i < 32; 
i++) { + w0 += _mm_popcnt_u64( PQCLEAN_MCELIECE348864F_AVX_vec128_extract(error[i], 0) ); + w0 += _mm_popcnt_u64( PQCLEAN_MCELIECE348864F_AVX_vec128_extract(error[i], 1) ); + } + + for (i = 0; i < SYS_N / 8; i++) { + w1 += _mm_popcnt_u64( e[i] ); + } + + check = (w0 ^ SYS_T) | (w1 ^ SYS_T); + check -= 1; + check >>= 15; + + return check; +} + +static uint16_t synd_cmp(vec128 *s0, vec128 *s1) { + int i; + vec128 diff; + + diff = PQCLEAN_MCELIECE348864F_AVX_vec128_xor(s0[0], s1[0]); + + for (i = 1; i < GFBITS; i++) { + diff = PQCLEAN_MCELIECE348864F_AVX_vec128_or(diff, PQCLEAN_MCELIECE348864F_AVX_vec128_xor(s0[i], s1[i])); + } + + return (uint16_t)PQCLEAN_MCELIECE348864F_AVX_vec128_testz(diff); +} + +static void reformat_128to256(vec256 *out, vec128 *in) { + int i; + uint64_t v[4]; + + for (i = 0; i < 16; i++) { + v[0] = PQCLEAN_MCELIECE348864F_AVX_vec128_extract(in[2 * i + 0], 0); + v[1] = PQCLEAN_MCELIECE348864F_AVX_vec128_extract(in[2 * i + 0], 1); + v[2] = PQCLEAN_MCELIECE348864F_AVX_vec128_extract(in[2 * i + 1], 0); + v[3] = PQCLEAN_MCELIECE348864F_AVX_vec128_extract(in[2 * i + 1], 1); + + out[i] = PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(v[0], v[1], v[2], v[3]); + } +} + +static void reformat_256to128(vec128 *out, vec256 *in) { + int i; + uint64_t v[4]; + + for (i = 0; i < 16; i++) { + v[0] = PQCLEAN_MCELIECE348864F_AVX_vec256_extract(in[i], 0); + v[1] = PQCLEAN_MCELIECE348864F_AVX_vec256_extract(in[i], 1); + v[2] = PQCLEAN_MCELIECE348864F_AVX_vec256_extract(in[i], 2); + v[3] = PQCLEAN_MCELIECE348864F_AVX_vec256_extract(in[i], 3); + + out[2 * i + 0] = PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(v[0], v[1]); + out[2 * i + 1] = PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(v[2], v[3]); + } +} + +/* Niederreiter decryption with the Berlekamp decoder */ +/* intput: sk, secret key */ +/* c, ciphertext (syndrome) */ +/* output: e, error vector */ +/* return: 0 for success; 1 for failure */ +int PQCLEAN_MCELIECE348864F_AVX_decrypt(unsigned char *e, const unsigned char *sk, const unsigned char *c) { + int i; + + uint16_t check_synd; + uint16_t check_weight; + + vec256 inv[ 16 ][ GFBITS ]; + vec256 scaled[ 16 ][ GFBITS ]; + vec256 eval[16][ GFBITS ]; + + vec128 error128[ 32 ]; + vec256 error256[ 16 ]; + + vec128 s_priv[ GFBITS ]; + vec128 s_priv_cmp[ GFBITS ]; + uint64_t locator[ GFBITS ]; + + vec128 recv128[ 32 ]; + vec256 recv256[ 16 ]; + vec256 allone; + + uint64_t bits_int[23][32]; + + // Berlekamp decoder + + preprocess(recv128, c); + + PQCLEAN_MCELIECE348864F_AVX_load_bits(bits_int, sk + IRR_BYTES); + PQCLEAN_MCELIECE348864F_AVX_benes((uint64_t *) recv128, bits_int, 1); + + reformat_128to256(recv256, recv128); + + scaling(scaled, inv, sk, recv256); + PQCLEAN_MCELIECE348864F_AVX_fft_tr(s_priv, scaled); + PQCLEAN_MCELIECE348864F_AVX_bm(locator, s_priv); + + PQCLEAN_MCELIECE348864F_AVX_fft(eval, locator); + + // reencryption and weight check + + allone = PQCLEAN_MCELIECE348864F_AVX_vec256_set1_16b(0xFFFF); + + for (i = 0; i < 16; i++) { + error256[i] = PQCLEAN_MCELIECE348864F_AVX_vec256_or_reduce(eval[i]); + error256[i] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(error256[i], allone); + } + + scaling_inv(scaled, inv, error256); + PQCLEAN_MCELIECE348864F_AVX_fft_tr(s_priv_cmp, scaled); + + check_synd = synd_cmp(s_priv, s_priv_cmp); + + // + + reformat_256to128(error128, error256); + PQCLEAN_MCELIECE348864F_AVX_benes((uint64_t *) error128, bits_int, 0); + + postprocess(e, error128); + + check_weight = weight_check(e, error128); + + return 1 - (check_synd & check_weight); +} + diff --git 
a/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/decrypt.h b/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/decrypt.h new file mode 100644 index 0000000000..962f540224 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/decrypt.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_MCELIECE348864F_AVX_DECRYPT_H +#define PQCLEAN_MCELIECE348864F_AVX_DECRYPT_H +/* + This file is for Nieddereiter decryption +*/ + +int PQCLEAN_MCELIECE348864F_AVX_decrypt(unsigned char * /*e*/, const unsigned char * /*sk*/, const unsigned char * /*c*/); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/encrypt.c b/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/encrypt.c new file mode 100644 index 0000000000..09283ffd2e --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/encrypt.c @@ -0,0 +1,99 @@ +/* + This file is for Niederreiter encryption +*/ + +#include "encrypt.h" + +#include "gf.h" +#include "int32_sort.h" +#include "params.h" +#include "randombytes.h" +#include "util.h" + +#include +#include +#include + +/* input: public key pk, error vector e */ +/* output: syndrome s */ +extern void PQCLEAN_MCELIECE348864F_AVX_syndrome_asm(unsigned char *s, const unsigned char *pk, unsigned char *e); + +/* output: e, an error vector of weight t */ +static void gen_e(unsigned char *e) { + int i, j, eq, count; + + uint16_t ind[ SYS_T * 2 ]; + int32_t ind32[ SYS_T * 2 ]; + uint64_t e_int[ (SYS_N + 63) / 64 ]; + uint64_t one = 1; + uint64_t mask; + uint64_t val[ SYS_T ]; + + while (1) { + randombytes((uint8_t *) ind, sizeof(ind)); + + for (i = 0; i < SYS_T * 2; i++) { + ind[i] &= GFMASK; + } + + // + + count = 0; + for (i = 0; i < SYS_T * 2; i++) { + if (ind[i] < SYS_N) { + ind32[ count++ ] = ind[i]; + } + } + + if (count < SYS_T) { + continue; + } + + // check for repetition + + PQCLEAN_MCELIECE348864F_AVX_int32_sort(ind32, SYS_T); + + eq = 0; + for (i = 1; i < SYS_T; i++) { + if (ind32[i - 1] == ind32[i]) { + eq = 1; + } + } + + if (eq == 0) { + break; + } + } + + for (j = 0; j < SYS_T; j++) { + val[j] = one << (ind32[j] & 63); + } + + for (i = 0; i < (SYS_N + 63) / 64; i++) { + e_int[i] = 0; + + for (j = 0; j < SYS_T; j++) { + mask = i ^ (ind32[j] >> 6); + mask -= 1; + mask >>= 63; + mask = -mask; + + e_int[i] |= val[j] & mask; + } + } + + for (i = 0; i < (SYS_N + 63) / 64 - 1; i++) { + PQCLEAN_MCELIECE348864F_AVX_store8(e, e_int[i]); + e += 8; + } + + for (j = 0; j < (SYS_N % 64); j += 8) { + e[ j / 8 ] = (e_int[i] >> j) & 0xFF; + } +} + +void PQCLEAN_MCELIECE348864F_AVX_encrypt(unsigned char *s, unsigned char *e, const unsigned char *pk) { + gen_e(e); + PQCLEAN_MCELIECE348864F_AVX_syndrome_asm(s, pk, e); +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/encrypt.h b/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/encrypt.h new file mode 100644 index 0000000000..ea547501f7 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/encrypt.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_MCELIECE348864F_AVX_ENCRYPT_H +#define PQCLEAN_MCELIECE348864F_AVX_ENCRYPT_H +/* + This file is for Niederreiter encryption +*/ + + +void PQCLEAN_MCELIECE348864F_AVX_encrypt(unsigned char * /*s*/, unsigned char * /*e*/, const unsigned char * /*pk*/); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/fft.c b/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/fft.c new file mode 100644 index 0000000000..74f95edb54 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/fft.c @@ -0,0 +1,172 @@ +/* + 
This file is for the Gao-Mateer FFT + sse http://www.math.clemson.edu/~sgao/papers/GM10.pdf +*/ + +#include "fft.h" + +#include "vec.h" + +/* input: in, polynomial in bitsliced form */ +/* output: in, result of applying the radix conversions on in */ +static void radix_conversions(uint64_t *in) { + int i, j, k; + + const uint64_t mask[5][2] = { + {0x8888888888888888, 0x4444444444444444}, + {0xC0C0C0C0C0C0C0C0, 0x3030303030303030}, + {0xF000F000F000F000, 0x0F000F000F000F00}, + {0xFF000000FF000000, 0x00FF000000FF0000}, + {0xFFFF000000000000, 0x0000FFFF00000000} + }; + + const uint64_t s[5][GFBITS] = { +#include "scalars.inc" + }; + + // + + for (j = 0; j <= 4; j++) { + for (i = 0; i < GFBITS; i++) { + for (k = 4; k >= j; k--) { + in[i] ^= (in[i] & mask[k][0]) >> (1 << k); + in[i] ^= (in[i] & mask[k][1]) >> (1 << k); + } + } + + PQCLEAN_MCELIECE348864F_AVX_vec_mul(in, in, s[j]); // scaling + } +} + +/* input: in, result of applying the radix conversions to the input polynomial */ +/* output: out, evaluation results (by applying the FFT butterflies) */ +static void butterflies(vec256 out[][ GFBITS ], const uint64_t *in) { + int i, j, k, s, b; + + uint64_t t0, t1, t2, t3; + + const vec256 consts[ 17 ][ GFBITS ] = { +#include "consts.inc" + }; + + uint64_t consts_ptr = 0; + + const unsigned char reversal[64] = { + 0, 32, 16, 48, 8, 40, 24, 56, + 4, 36, 20, 52, 12, 44, 28, 60, + 2, 34, 18, 50, 10, 42, 26, 58, + 6, 38, 22, 54, 14, 46, 30, 62, + 1, 33, 17, 49, 9, 41, 25, 57, + 5, 37, 21, 53, 13, 45, 29, 61, + 3, 35, 19, 51, 11, 43, 27, 59, + 7, 39, 23, 55, 15, 47, 31, 63 + }; + + // boradcast + + vec256 tmp256[ GFBITS ]; + vec256 x[ GFBITS ], y[ GFBITS ]; + + for (j = 0; j < 64; j += 8) { + for (i = 0; i < GFBITS; i++) { + t0 = (in[i] >> reversal[j + 0]) & 1; + t0 = -t0; + t1 = (in[i] >> reversal[j + 2]) & 1; + t1 = -t1; + t2 = (in[i] >> reversal[j + 4]) & 1; + t2 = -t2; + t3 = (in[i] >> reversal[j + 6]) & 1; + t3 = -t3; + + out[j / 4 + 0][i] = PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(t0, t1, t2, t3); + + t0 = (in[i] >> reversal[j + 1]) & 1; + t0 = -t0; + t1 = (in[i] >> reversal[j + 3]) & 1; + t1 = -t1; + t2 = (in[i] >> reversal[j + 5]) & 1; + t2 = -t2; + t3 = (in[i] >> reversal[j + 7]) & 1; + t3 = -t3; + + out[j / 4 + 1][i] = PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(t0, t1, t2, t3); + } + } + + // + + for (i = 0; i < 16; i += 2) { + PQCLEAN_MCELIECE348864F_AVX_vec256_mul(tmp256, out[i + 1], consts[ 0 ]); + + for (b = 0; b < GFBITS; b++) { + out[i + 0][b] ^= tmp256[b]; + } + for (b = 0; b < GFBITS; b++) { + out[i + 1][b] ^= out[i + 0][b]; + } + } + + for (i = 0; i < 16; i += 2) { + for (b = 0; b < GFBITS; b++) { + x[b] = PQCLEAN_MCELIECE348864F_AVX_vec256_unpack_low_2x(out[i + 0][b], out[i + 1][b]); + } + for (b = 0; b < GFBITS; b++) { + y[b] = PQCLEAN_MCELIECE348864F_AVX_vec256_unpack_high_2x(out[i + 0][b], out[i + 1][b]); + } + + PQCLEAN_MCELIECE348864F_AVX_vec256_mul(tmp256, y, consts[ 1 ]); + + for (b = 0; b < GFBITS; b++) { + x[b] ^= tmp256[b]; + } + for (b = 0; b < GFBITS; b++) { + y[b] ^= x[b]; + } + + for (b = 0; b < GFBITS; b++) { + out[i + 0][b] = PQCLEAN_MCELIECE348864F_AVX_vec256_unpack_low(x[b], y[b]); + } + for (b = 0; b < GFBITS; b++) { + out[i + 1][b] = PQCLEAN_MCELIECE348864F_AVX_vec256_unpack_high(x[b], y[b]); + } + } + + consts_ptr = 2; + + for (i = 0; i <= 3; i++) { + s = 1 << i; + + for (j = 0; j < 16; j += 2 * s) { + for (k = j; k < j + s; k++) { + PQCLEAN_MCELIECE348864F_AVX_vec256_mul(tmp256, out[k + s], consts[ consts_ptr + (k - j) ]); + + for (b = 0; b < GFBITS; b++) { + 
out[k][b] ^= tmp256[b]; + } + for (b = 0; b < GFBITS; b++) { + out[k + s][b] ^= out[k][b]; + } + } + } + + consts_ptr += s; + } + + // adding the part contributed by x^64 + + vec256 powers[16][GFBITS] = { +#include "powers.inc" + }; + + for (i = 0; i < 16; i++) { + for (b = 0; b < GFBITS; b++) { + out[i][b] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(out[i][b], powers[i][b]); + } + } +} + +void PQCLEAN_MCELIECE348864F_AVX_fft(vec256 out[][ GFBITS ], uint64_t *in) { + radix_conversions(in); + butterflies(out, in); +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/fft.h b/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/fft.h new file mode 100644 index 0000000000..fe39685a46 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/fft.h @@ -0,0 +1,18 @@ +#ifndef PQCLEAN_MCELIECE348864F_AVX_FFT_H +#define PQCLEAN_MCELIECE348864F_AVX_FFT_H + +/* + This file is for the Gao-Mateer FFT + sse http://www.math.clemson.edu/~sgao/papers/GM10.pdf +*/ + +#include + +#include "params.h" +#include "vec128.h" +#include "vec256.h" + +void PQCLEAN_MCELIECE348864F_AVX_fft(vec256 /*out*/[][GFBITS], uint64_t * /*in*/); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/fft_tr.c b/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/fft_tr.c new file mode 100644 index 0000000000..a8e1467d98 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/fft_tr.c @@ -0,0 +1,355 @@ +/* + This file is for transpose of the Gao-Mateer FFT + Functions with names ending with _tr are (roughly) the transpose of the corresponding functions in fft.c +*/ + +#include "fft_tr.h" + +#include "transpose.h" +#include "vec.h" + +#include + +static void radix_conversions_tr(vec128 in[ GFBITS ]) { + int i, j, k; + + const vec128 mask[10] = { + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x2222222222222222, 0x2222222222222222), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x4444444444444444, 0x4444444444444444), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x0C0C0C0C0C0C0C0C, 0x0C0C0C0C0C0C0C0C), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x3030303030303030, 0x3030303030303030), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x00F000F000F000F0, 0x00F000F000F000F0), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x0F000F000F000F00, 0x0F000F000F000F00), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x0000FF000000FF00, 0x0000FF000000FF00), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x00FF000000FF0000, 0x00FF000000FF0000), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x00000000FFFF0000, 0x00000000FFFF0000), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x0000FFFF00000000, 0x0000FFFF00000000) + }; + + const vec128 s[5][GFBITS] = { +#include "scalars_2x.inc" + }; + + uint64_t v0, v1; + + // + + for (j = 5; j >= 0; j--) { + + if (j < 5) { + PQCLEAN_MCELIECE348864F_AVX_vec128_mul(in, in, s[j]); + } + + for (i = 0; i < GFBITS; i++) { + for (k = j; k <= 4; k++) { + in[i] ^= PQCLEAN_MCELIECE348864F_AVX_vec128_sll_2x(in[i] & mask[2 * k + 0], 1 << k); + in[i] ^= PQCLEAN_MCELIECE348864F_AVX_vec128_sll_2x(in[i] & mask[2 * k + 1], 1 << k); + } + } + + for (i = 0; i < GFBITS; i++) { + v0 = PQCLEAN_MCELIECE348864F_AVX_vec128_extract(in[i], 0); + v1 = PQCLEAN_MCELIECE348864F_AVX_vec128_extract(in[i], 1); + + v1 ^= v0 >> 32; + v1 ^= v1 << 32; + + in[i] = PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(v0, v1); + } + } +} + +static void butterflies_tr(vec128 out[ GFBITS ], vec256 in[][ GFBITS ]) { + int i, j, k, s, b; + + uint64_t tmp[ GFBITS ]; + uint64_t pre[6][ GFBITS ]; + uint64_t out64[2][64]; + + vec256 
p2[ 6 ]; + vec256 buf[64]; + vec256 x[ GFBITS ], y[ GFBITS ]; + vec256 tmp256[ GFBITS ]; + + const vec256 consts[ 17 ][ GFBITS ] = { +#include "consts.inc" + }; + + uint64_t consts_ptr = 17; + + const unsigned char reversal[64] = { + 0, 32, 16, 48, 8, 40, 24, 56, + 4, 36, 20, 52, 12, 44, 28, 60, + 2, 34, 18, 50, 10, 42, 26, 58, + 6, 38, 22, 54, 14, 46, 30, 62, + 1, 33, 17, 49, 9, 41, 25, 57, + 5, 37, 21, 53, 13, 45, 29, 61, + 3, 35, 19, 51, 11, 43, 27, 59, + 7, 39, 23, 55, 15, 47, 31, 63 + }; + + const uint16_t beta[6] = {8, 1300, 3408, 1354, 2341, 1154}; + + // butterflies + + for (i = 3; i >= 0; i--) { + s = 1 << i; + consts_ptr -= s; + + for (j = 0; j < 16; j += 2 * s) { + for (k = j; k < j + s; k++) { + for (b = 0; b < GFBITS; b++) { + in[k][b] ^= in[k + s][b]; + } + PQCLEAN_MCELIECE348864F_AVX_vec256_mul(tmp256, in[k], consts[ consts_ptr + (k - j) ]); + for (b = 0; b < GFBITS; b++) { + in[k + s][b] ^= tmp256[b]; + } + } + } + } + + for (i = 0; i < 16; i += 2) { + for (b = 0; b < GFBITS; b++) { + x[b] = PQCLEAN_MCELIECE348864F_AVX_vec256_unpack_low(in[i + 0][b], in[i + 1][b]); + } + for (b = 0; b < GFBITS; b++) { + y[b] = PQCLEAN_MCELIECE348864F_AVX_vec256_unpack_high(in[i + 0][b], in[i + 1][b]); + } + + for (b = 0; b < GFBITS; b++) { + x[b] ^= y[b]; + } + PQCLEAN_MCELIECE348864F_AVX_vec256_mul(tmp256, x, consts[ 1 ]); + for (b = 0; b < GFBITS; b++) { + y[b] ^= tmp256[b]; + } + + for (b = 0; b < GFBITS; b++) { + in[i + 0][b] = PQCLEAN_MCELIECE348864F_AVX_vec256_unpack_low_2x(x[b], y[b]); + } + for (b = 0; b < GFBITS; b++) { + in[i + 1][b] = PQCLEAN_MCELIECE348864F_AVX_vec256_unpack_high_2x(x[b], y[b]); + } + } + + for (i = 0; i < 16; i += 2) { + for (b = 0; b < GFBITS; b++) { + in[i + 0][b] ^= in[i + 1][b]; + } + PQCLEAN_MCELIECE348864F_AVX_vec256_mul(tmp256, in[i + 0], consts[ 0 ]); + for (b = 0; b < GFBITS; b++) { + in[i + 1][b] ^= tmp256[b]; + } + } + + // transpose + + for (i = 0; i < GFBITS; i += 4) { + for (j = 0; j < 64; j += 8) { + buf[ reversal[j + 0] ] = PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(PQCLEAN_MCELIECE348864F_AVX_vec256_extract(in[j / 4 + 0][i + 0], 0), + PQCLEAN_MCELIECE348864F_AVX_vec256_extract(in[j / 4 + 0][i + 1], 0), + PQCLEAN_MCELIECE348864F_AVX_vec256_extract(in[j / 4 + 0][i + 2], 0), + PQCLEAN_MCELIECE348864F_AVX_vec256_extract(in[j / 4 + 0][i + 3], 0)); + buf[ reversal[j + 1] ] = PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(PQCLEAN_MCELIECE348864F_AVX_vec256_extract(in[j / 4 + 1][i + 0], 0), + PQCLEAN_MCELIECE348864F_AVX_vec256_extract(in[j / 4 + 1][i + 1], 0), + PQCLEAN_MCELIECE348864F_AVX_vec256_extract(in[j / 4 + 1][i + 2], 0), + PQCLEAN_MCELIECE348864F_AVX_vec256_extract(in[j / 4 + 1][i + 3], 0)); + buf[ reversal[j + 2] ] = PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(PQCLEAN_MCELIECE348864F_AVX_vec256_extract(in[j / 4 + 0][i + 0], 1), + PQCLEAN_MCELIECE348864F_AVX_vec256_extract(in[j / 4 + 0][i + 1], 1), + PQCLEAN_MCELIECE348864F_AVX_vec256_extract(in[j / 4 + 0][i + 2], 1), + PQCLEAN_MCELIECE348864F_AVX_vec256_extract(in[j / 4 + 0][i + 3], 1)); + buf[ reversal[j + 3] ] = PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(PQCLEAN_MCELIECE348864F_AVX_vec256_extract(in[j / 4 + 1][i + 0], 1), + PQCLEAN_MCELIECE348864F_AVX_vec256_extract(in[j / 4 + 1][i + 1], 1), + PQCLEAN_MCELIECE348864F_AVX_vec256_extract(in[j / 4 + 1][i + 2], 1), + PQCLEAN_MCELIECE348864F_AVX_vec256_extract(in[j / 4 + 1][i + 3], 1)); + buf[ reversal[j + 4] ] = PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(PQCLEAN_MCELIECE348864F_AVX_vec256_extract(in[j / 4 + 0][i + 0], 2), + 
PQCLEAN_MCELIECE348864F_AVX_vec256_extract(in[j / 4 + 0][i + 1], 2), + PQCLEAN_MCELIECE348864F_AVX_vec256_extract(in[j / 4 + 0][i + 2], 2), + PQCLEAN_MCELIECE348864F_AVX_vec256_extract(in[j / 4 + 0][i + 3], 2)); + buf[ reversal[j + 5] ] = PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(PQCLEAN_MCELIECE348864F_AVX_vec256_extract(in[j / 4 + 1][i + 0], 2), + PQCLEAN_MCELIECE348864F_AVX_vec256_extract(in[j / 4 + 1][i + 1], 2), + PQCLEAN_MCELIECE348864F_AVX_vec256_extract(in[j / 4 + 1][i + 2], 2), + PQCLEAN_MCELIECE348864F_AVX_vec256_extract(in[j / 4 + 1][i + 3], 2)); + buf[ reversal[j + 6] ] = PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(PQCLEAN_MCELIECE348864F_AVX_vec256_extract(in[j / 4 + 0][i + 0], 3), + PQCLEAN_MCELIECE348864F_AVX_vec256_extract(in[j / 4 + 0][i + 1], 3), + PQCLEAN_MCELIECE348864F_AVX_vec256_extract(in[j / 4 + 0][i + 2], 3), + PQCLEAN_MCELIECE348864F_AVX_vec256_extract(in[j / 4 + 0][i + 3], 3)); + buf[ reversal[j + 7] ] = PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(PQCLEAN_MCELIECE348864F_AVX_vec256_extract(in[j / 4 + 1][i + 0], 3), + PQCLEAN_MCELIECE348864F_AVX_vec256_extract(in[j / 4 + 1][i + 1], 3), + PQCLEAN_MCELIECE348864F_AVX_vec256_extract(in[j / 4 + 1][i + 2], 3), + PQCLEAN_MCELIECE348864F_AVX_vec256_extract(in[j / 4 + 1][i + 3], 3)); + } + + PQCLEAN_MCELIECE348864F_AVX_transpose_64x256_sp(buf); + + p2[0] = buf[32]; + buf[33] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[33], buf[32]); + p2[1] = buf[33]; + buf[35] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[35], buf[33]); + p2[0] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[0], buf[35]); + buf[34] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[34], buf[35]); + p2[2] = buf[34]; + buf[38] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[38], buf[34]); + p2[0] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[0], buf[38]); + buf[39] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[39], buf[38]); + p2[1] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[1], buf[39]); + buf[37] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[37], buf[39]); + p2[0] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[0], buf[37]); + buf[36] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[36], buf[37]); + p2[3] = buf[36]; + buf[44] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[44], buf[36]); + p2[0] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[0], buf[44]); + buf[45] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[45], buf[44]); + p2[1] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[1], buf[45]); + buf[47] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[47], buf[45]); + p2[0] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[0], buf[47]); + buf[46] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[46], buf[47]); + p2[2] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[2], buf[46]); + buf[42] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[42], buf[46]); + p2[0] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[0], buf[42]); + buf[43] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[43], buf[42]); + p2[1] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[1], buf[43]); + buf[41] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[41], buf[43]); + p2[0] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[0], buf[41]); + buf[40] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[40], buf[41]); + p2[4] = buf[40]; + buf[56] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[56], buf[40]); + p2[0] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[0], buf[56]); + buf[57] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[57], buf[56]); + p2[1] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[1], buf[57]); + buf[59] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[59], buf[57]); + p2[0] = 
PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[0], buf[59]); + buf[58] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[58], buf[59]); + p2[2] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[2], buf[58]); + buf[62] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[62], buf[58]); + p2[0] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[0], buf[62]); + buf[63] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[63], buf[62]); + p2[1] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[1], buf[63]); + buf[61] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[61], buf[63]); + p2[0] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[0], buf[61]); + buf[60] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[60], buf[61]); + p2[3] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[3], buf[60]); + buf[52] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[52], buf[60]); + p2[0] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[0], buf[52]); + buf[53] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[53], buf[52]); + p2[1] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[1], buf[53]); + buf[55] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[55], buf[53]); + p2[0] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[0], buf[55]); + buf[54] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[54], buf[55]); + p2[2] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[2], buf[54]); + buf[50] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[50], buf[54]); + p2[0] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[0], buf[50]); + buf[51] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[51], buf[50]); + p2[1] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[1], buf[51]); + buf[49] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[49], buf[51]); + p2[0] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[0], buf[49]); + buf[48] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[48], buf[49]); + p2[5] = buf[48]; + buf[16] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[16], buf[48]); + p2[0] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[0], buf[16]); + buf[17] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[17], buf[16]); + p2[1] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[1], buf[17]); + buf[19] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[19], buf[17]); + p2[0] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[0], buf[19]); + buf[18] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[18], buf[19]); + p2[2] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[2], buf[18]); + buf[22] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[22], buf[18]); + p2[0] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[0], buf[22]); + buf[23] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[23], buf[22]); + p2[1] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[1], buf[23]); + buf[21] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[21], buf[23]); + p2[0] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[0], buf[21]); + buf[20] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[20], buf[21]); + p2[3] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[3], buf[20]); + buf[28] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[28], buf[20]); + p2[0] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[0], buf[28]); + buf[29] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[29], buf[28]); + p2[1] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[1], buf[29]); + buf[31] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[31], buf[29]); + p2[0] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[0], buf[31]); + buf[30] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[30], buf[31]); + p2[2] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[2], buf[30]); + buf[26] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[26], buf[30]); + p2[0] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[0], buf[26]); + buf[27] 
= PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[27], buf[26]); + p2[1] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[1], buf[27]); + buf[25] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[25], buf[27]); + p2[0] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[0], buf[25]); + buf[24] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[24], buf[25]); + p2[4] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[4], buf[24]); + buf[8] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[8], buf[24]); + p2[0] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[0], buf[8]); + buf[9] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[9], buf[8]); + p2[1] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[1], buf[9]); + buf[11] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[11], buf[9]); + p2[0] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[0], buf[11]); + buf[10] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[10], buf[11]); + p2[2] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[2], buf[10]); + buf[14] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[14], buf[10]); + p2[0] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[0], buf[14]); + buf[15] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[15], buf[14]); + p2[1] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[1], buf[15]); + buf[13] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[13], buf[15]); + p2[0] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[0], buf[13]); + buf[12] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[12], buf[13]); + p2[3] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[3], buf[12]); + buf[4] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[4], buf[12]); + p2[0] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[0], buf[4]); + buf[5] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[5], buf[4]); + p2[1] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[1], buf[5]); + buf[7] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[7], buf[5]); + p2[0] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[0], buf[7]); + buf[6] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[6], buf[7]); + p2[2] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[2], buf[6]); + buf[2] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[2], buf[6]); + p2[0] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[0], buf[2]); + buf[3] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[3], buf[2]); + p2[1] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[1], buf[3]); + buf[1] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[1], buf[3]); + + p2[0] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(p2[0], buf[1]); + buf[0] = PQCLEAN_MCELIECE348864F_AVX_vec256_xor(buf[0], buf[1]); + + for (j = 0; j < 6; j++) { + pre[j][i + 0] = PQCLEAN_MCELIECE348864F_AVX_vec256_extract(p2[j], 0); + pre[j][i + 1] = PQCLEAN_MCELIECE348864F_AVX_vec256_extract(p2[j], 1); + pre[j][i + 2] = PQCLEAN_MCELIECE348864F_AVX_vec256_extract(p2[j], 2); + pre[j][i + 3] = PQCLEAN_MCELIECE348864F_AVX_vec256_extract(p2[j], 3); + } + + out64[0][i + 0] = PQCLEAN_MCELIECE348864F_AVX_vec256_extract(buf[0], 0); + out64[0][i + 1] = PQCLEAN_MCELIECE348864F_AVX_vec256_extract(buf[0], 1); + out64[0][i + 2] = PQCLEAN_MCELIECE348864F_AVX_vec256_extract(buf[0], 2); + out64[0][i + 3] = PQCLEAN_MCELIECE348864F_AVX_vec256_extract(buf[0], 3); + } + + // + + for (j = 0; j < GFBITS; j++) { + tmp[j] = (beta[0] >> j) & 1; + tmp[j] = -tmp[j]; + } + + PQCLEAN_MCELIECE348864F_AVX_vec_mul(out64[1], pre[0], tmp); + + for (i = 1; i < 6; i++) { + for (j = 0; j < GFBITS; j++) { + tmp[j] = (beta[i] >> j) & 1; + tmp[j] = -tmp[j]; + } + + PQCLEAN_MCELIECE348864F_AVX_vec_mul(tmp, pre[i], tmp); + PQCLEAN_MCELIECE348864F_AVX_vec_add(out64[1], out64[1], tmp); + } + + for (i = 0; i < GFBITS; i++) { + out[i] = 
PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(out64[0][i], out64[1][i]); + } +} + +void PQCLEAN_MCELIECE348864F_AVX_fft_tr(vec128 out[GFBITS], vec256 in[][ GFBITS ]) { + butterflies_tr(out, in); + radix_conversions_tr(out); +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/fft_tr.h b/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/fft_tr.h new file mode 100644 index 0000000000..0b65c32440 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/fft_tr.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE348864F_AVX_FFT_TR_H +#define PQCLEAN_MCELIECE348864F_AVX_FFT_TR_H +/* + This file is for transpose of the Gao-Mateer FFT +*/ + + +#include "params.h" +#include "vec256.h" + +void PQCLEAN_MCELIECE348864F_AVX_fft_tr(vec128 * /*out*/, vec256 /*in*/[][ GFBITS ]); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/gf.c b/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/gf.c new file mode 100644 index 0000000000..f452be9fed --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/gf.c @@ -0,0 +1,169 @@ +/* + This file is for functions for field arithmetic +*/ + +#include "gf.h" + +#include "params.h" + +gf PQCLEAN_MCELIECE348864F_AVX_gf_iszero(gf a) { + uint32_t t = a; + + t -= 1; + t >>= 20; + + return (gf) t; +} + +gf PQCLEAN_MCELIECE348864F_AVX_gf_add(gf in0, gf in1) { + return in0 ^ in1; +} + +gf PQCLEAN_MCELIECE348864F_AVX_gf_mul(gf in0, gf in1) { + int i; + + uint32_t tmp; + uint32_t t0; + uint32_t t1; + uint32_t t; + + t0 = in0; + t1 = in1; + + tmp = t0 * (t1 & 1); + + for (i = 1; i < GFBITS; i++) { + tmp ^= (t0 * (t1 & (1 << i))); + } + + t = tmp & 0x7FC000; + tmp ^= t >> 9; + tmp ^= t >> 12; + + t = tmp & 0x3000; + tmp ^= t >> 9; + tmp ^= t >> 12; + + return tmp & ((1 << GFBITS) - 1); +} + +/* input: field element in */ +/* return: in^2 */ +static inline gf gf_sq(gf in) { + const uint32_t B[] = {0x55555555, 0x33333333, 0x0F0F0F0F, 0x00FF00FF}; + + uint32_t x = in; + uint32_t t; + + x = (x | (x << 8)) & B[3]; + x = (x | (x << 4)) & B[2]; + x = (x | (x << 2)) & B[1]; + x = (x | (x << 1)) & B[0]; + + t = x & 0x7FC000; + x ^= t >> 9; + x ^= t >> 12; + + t = x & 0x3000; + x ^= t >> 9; + x ^= t >> 12; + + return x & ((1 << GFBITS) - 1); +} + +gf PQCLEAN_MCELIECE348864F_AVX_gf_inv(gf in) { + gf tmp_11; + gf tmp_1111; + + gf out = in; + + out = gf_sq(out); + tmp_11 = PQCLEAN_MCELIECE348864F_AVX_gf_mul(out, in); // 11 + + out = gf_sq(tmp_11); + out = gf_sq(out); + tmp_1111 = PQCLEAN_MCELIECE348864F_AVX_gf_mul(out, tmp_11); // 1111 + + out = gf_sq(tmp_1111); + out = gf_sq(out); + out = gf_sq(out); + out = gf_sq(out); + out = PQCLEAN_MCELIECE348864F_AVX_gf_mul(out, tmp_1111); // 11111111 + + out = gf_sq(out); + out = gf_sq(out); + out = PQCLEAN_MCELIECE348864F_AVX_gf_mul(out, tmp_11); // 1111111111 + + out = gf_sq(out); + out = PQCLEAN_MCELIECE348864F_AVX_gf_mul(out, in); // 11111111111 + + return gf_sq(out); // 111111111110 +} + +/* input: field element den, num */ +/* return: (num/den) */ +gf PQCLEAN_MCELIECE348864F_AVX_gf_frac(gf den, gf num) { + return PQCLEAN_MCELIECE348864F_AVX_gf_mul(PQCLEAN_MCELIECE348864F_AVX_gf_inv(den), num); +} + +/* input: in0, in1 in GF((2^m)^t)*/ +/* output: out = in0*in1 */ +void PQCLEAN_MCELIECE348864F_AVX_GF_mul(gf *out, const gf *in0, const gf *in1) { + int i, j; + + gf prod[ SYS_T * 2 - 1 ]; + + for (i = 0; i < SYS_T * 2 - 1; i++) { + prod[i] = 0; + } + + for (i = 0; i < SYS_T; i++) { + for (j = 0; j < SYS_T; j++) { + prod[i + j] ^= PQCLEAN_MCELIECE348864F_AVX_gf_mul(in0[i], in1[j]); + } 
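+        // At this point the inner loop has accumulated in0[i]*in1[j] for every j;
+        // once the outer loop over i completes, prod[] holds the full schoolbook
+        // product (2*SYS_T-1 coefficients over GF(2^m)). The reduction loop further
+        // below then folds each coefficient of degree >= SYS_T back into positions
+        // i-SYS_T+9, +7, +5 and +0, scaled by 877, 2888, 1781 and 373, i.e. it
+        // reduces the product modulo the fixed degree-SYS_T field polynomial.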
+ } + + // + + for (i = (SYS_T - 1) * 2; i >= SYS_T; i--) { + prod[i - SYS_T + 9] ^= PQCLEAN_MCELIECE348864F_AVX_gf_mul(prod[i], (gf) 877); + prod[i - SYS_T + 7] ^= PQCLEAN_MCELIECE348864F_AVX_gf_mul(prod[i], (gf) 2888); + prod[i - SYS_T + 5] ^= PQCLEAN_MCELIECE348864F_AVX_gf_mul(prod[i], (gf) 1781); + prod[i - SYS_T + 0] ^= PQCLEAN_MCELIECE348864F_AVX_gf_mul(prod[i], (gf) 373); + } + + for (i = 0; i < SYS_T; i++) { + out[i] = prod[i]; + } +} + +/* 2 field multiplications */ +uint64_t PQCLEAN_MCELIECE348864F_AVX_gf_mul2(gf a, gf b0, gf b1) { + int i; + + uint64_t tmp = 0; + uint64_t t0; + uint64_t t1; + uint64_t t; + uint64_t mask = 0x0000000100000001; + + t0 = a; + t1 = b1; + t1 = (t1 << 32) | b0; + + for (i = 0; i < GFBITS; i++) { + tmp ^= t0 * (t1 & mask); + mask += mask; + } + + // + + t = tmp & 0x007FC000007FC000; + tmp ^= (t >> 9) ^ (t >> 12); + + t = tmp & 0x0000300000003000; + tmp ^= (t >> 9) ^ (t >> 12); + + return tmp & 0x00000FFF00000FFF; +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/gf.h b/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/gf.h new file mode 100644 index 0000000000..5b7ca3f31c --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/gf.h @@ -0,0 +1,26 @@ +#ifndef PQCLEAN_MCELIECE348864F_AVX_GF_H +#define PQCLEAN_MCELIECE348864F_AVX_GF_H +/* + This file is for functions for field arithmetic +*/ + + +#include "params.h" + +#include + +typedef uint16_t gf; + +gf PQCLEAN_MCELIECE348864F_AVX_gf_iszero(gf /*a*/); +gf PQCLEAN_MCELIECE348864F_AVX_gf_add(gf /*in0*/, gf /*in1*/); +gf PQCLEAN_MCELIECE348864F_AVX_gf_mul(gf /*in0*/, gf /*in1*/); +gf PQCLEAN_MCELIECE348864F_AVX_gf_frac(gf /*den*/, gf /*num*/); +gf PQCLEAN_MCELIECE348864F_AVX_gf_inv(gf /*in*/); + +void PQCLEAN_MCELIECE348864F_AVX_GF_mul(gf * /*out*/, const gf * /*in0*/, const gf * /*in1*/); + +/* 2 field multiplications */ +uint64_t PQCLEAN_MCELIECE348864F_AVX_gf_mul2(gf a, gf b0, gf b1); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/int32_sort.c b/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/int32_sort.c new file mode 100644 index 0000000000..3d00867db2 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/int32_sort.c @@ -0,0 +1,1211 @@ +#include "int32_sort.h" +#define int32 int32_t + +#include + +typedef __m256i int32x8; +#define int32x8_load(z) _mm256_loadu_si256((__m256i_u *) (z)) +#define int32x8_store(z,i) _mm256_storeu_si256((__m256i_u *) (z),(i)) +#define int32x8_min _mm256_min_epi32 +#define int32x8_max _mm256_max_epi32 + +#define int32x8_MINMAX(a,b) \ + do { \ + int32x8 c = int32x8_min(a, b); \ + (b) = int32x8_max(a,b); \ + (a) = c; \ + } while(0) + +static inline void int32_MINMAX(int32 *a, int32 *b) { + int32 ab = *b ^ *a; + int32 c = *b - *a; + c ^= ab & (c ^ *b); + c >>= 31; + c &= ab; + *a ^= c; + *b ^= c; +} + +static void minmax_vector(int32 *x, int32 *y, size_t n) { + if (n < 8) { + while (n > 0) { + int32_MINMAX(x, y); + ++x; + ++y; + --n; + } + return; + } + if (n & 7) { + int32x8 x0 = int32x8_load(x + n - 8); + int32x8 y0 = int32x8_load(y + n - 8); + int32x8_MINMAX(x0, y0); + int32x8_store(x + n - 8, x0); + int32x8_store(y + n - 8, y0); + n &= ~7; + } + do { + int32x8 x0 = int32x8_load(x); + int32x8 y0 = int32x8_load(y); + int32x8_MINMAX(x0, y0); + int32x8_store(x, x0); + int32x8_store(y, y0); + x += 8; + y += 8; + n -= 8; + } while (n); +} + +/* stages 8,4,2,1 of size-16 bitonic merging */ +static void merge16_finish(int32 *x, int32x8 x0, int32x8 x1, int flagdown) { + int32x8 b0, b1, 
c0, c1, mask; + + int32x8_MINMAX(x0, x1); + + b0 = _mm256_permute2x128_si256(x0, x1, 0x20); /* A0123B0123 */ + b1 = _mm256_permute2x128_si256(x0, x1, 0x31); /* A4567B4567 */ + + int32x8_MINMAX(b0, b1); + + c0 = _mm256_unpacklo_epi64(b0, b1); /* A0145B0145 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* A2367B2367 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A0213B0213 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A4657B4657 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* A0246B0246 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* A1357B1357 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A0123B0123 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A4567B4567 */ + + x0 = _mm256_permute2x128_si256(b0, b1, 0x20); /* A01234567 */ + x1 = _mm256_permute2x128_si256(b0, b1, 0x31); /* A01234567 */ + + if (flagdown) { + mask = _mm256_set1_epi32(-1); + x0 ^= mask; + x1 ^= mask; + } + + int32x8_store(&x[0], x0); + int32x8_store(&x[8], x1); +} + +/* stages 64,32 of bitonic merging; n is multiple of 128 */ +static void int32_twostages_32(int32 *x, size_t n) { + size_t i; + + while (n > 0) { + for (i = 0; i < 32; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + 32]); + int32x8 x2 = int32x8_load(&x[i + 64]); + int32x8 x3 = int32x8_load(&x[i + 96]); + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + 32], x1); + int32x8_store(&x[i + 64], x2); + int32x8_store(&x[i + 96], x3); + } + x += 128; + n -= 128; + } +} + +/* stages 4q,2q,q of bitonic merging */ +static size_t int32_threestages(int32 *x, size_t n, size_t q) { + size_t k, i; + + for (k = 0; k + 8 * q <= n; k += 8 * q) { + for (i = k; i < k + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + } + + return k; +} + +/* n is a power of 2; n >= 8; if n == 8 then flagdown */ +// NOLINTNEXTLINE(google-readability-function-size) +static void int32_sort_2power(int32 *x, size_t n, int flagdown) { + size_t p, q, i, j, k; + int32x8 mask; + + if (n == 8) { + int32 x0 = x[0]; + int32 x1 = x[1]; + int32 x2 = x[2]; + int32 x3 = x[3]; + int32 x4 = x[4]; + int32 x5 = x[5]; + int32 x6 = x[6]; + int32 x7 = x[7]; + + /* odd-even sort instead of bitonic sort */ + + int32_MINMAX(&x1, &x0); + int32_MINMAX(&x3, &x2); + int32_MINMAX(&x2, &x0); + int32_MINMAX(&x3, &x1); + int32_MINMAX(&x2, &x1); + + int32_MINMAX(&x5, &x4); + int32_MINMAX(&x7, &x6); + int32_MINMAX(&x6, &x4); + int32_MINMAX(&x7, &x5); + int32_MINMAX(&x6, &x5); + + int32_MINMAX(&x4, &x0); + int32_MINMAX(&x6, &x2); + int32_MINMAX(&x4, &x2); + + 
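+        /* Each int32_MINMAX above and below is a branchless compare-and-swap,
+           so this fixed comparator network sorts the 8 elements in
+           data-independent time (no secret-dependent branches or indices). */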
int32_MINMAX(&x5, &x1); + int32_MINMAX(&x7, &x3); + int32_MINMAX(&x5, &x3); + + int32_MINMAX(&x2, &x1); + int32_MINMAX(&x4, &x3); + int32_MINMAX(&x6, &x5); + + x[0] = x0; + x[1] = x1; + x[2] = x2; + x[3] = x3; + x[4] = x4; + x[5] = x5; + x[6] = x6; + x[7] = x7; + return; + } + + if (n == 16) { + int32x8 x0, x1, b0, b1, c0, c1; + + x0 = int32x8_load(&x[0]); + x1 = int32x8_load(&x[8]); + + mask = _mm256_set_epi32(0, 0, -1, -1, 0, 0, -1, -1); + + x0 ^= mask; /* A01234567 */ + x1 ^= mask; /* B01234567 */ + + b0 = _mm256_unpacklo_epi32(x0, x1); /* AB0AB1AB4AB5 */ + b1 = _mm256_unpackhi_epi32(x0, x1); /* AB2AB3AB6AB7 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* AB0AB2AB4AB6 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* AB1AB3AB5AB7 */ + + int32x8_MINMAX(c0, c1); + + mask = _mm256_set_epi32(0, 0, -1, -1, -1, -1, 0, 0); + c0 ^= mask; + c1 ^= mask; + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A01B01A45B45 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A23B23A67B67 */ + + int32x8_MINMAX(b0, b1); + + x0 = _mm256_unpacklo_epi64(b0, b1); /* A01234567 */ + x1 = _mm256_unpackhi_epi64(b0, b1); /* B01234567 */ + + b0 = _mm256_unpacklo_epi32(x0, x1); /* AB0AB1AB4AB5 */ + b1 = _mm256_unpackhi_epi32(x0, x1); /* AB2AB3AB6AB7 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* AB0AB2AB4AB6 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* AB1AB3AB5AB7 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A01B01A45B45 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A23B23A67B67 */ + + b0 ^= mask; + b1 ^= mask; + + c0 = _mm256_permute2x128_si256(b0, b1, 0x20); /* A01B01A23B23 */ + c1 = _mm256_permute2x128_si256(b0, b1, 0x31); /* A45B45A67B67 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_permute2x128_si256(c0, c1, 0x20); /* A01B01A45B45 */ + b1 = _mm256_permute2x128_si256(c0, c1, 0x31); /* A23B23A67B67 */ + + int32x8_MINMAX(b0, b1); + + x0 = _mm256_unpacklo_epi64(b0, b1); /* A01234567 */ + x1 = _mm256_unpackhi_epi64(b0, b1); /* B01234567 */ + + b0 = _mm256_unpacklo_epi32(x0, x1); /* AB0AB1AB4AB5 */ + b1 = _mm256_unpackhi_epi32(x0, x1); /* AB2AB3AB6AB7 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* AB0AB2AB4AB6 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* AB1AB3AB5AB7 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A01B01A45B45 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A23B23A67B67 */ + + x0 = _mm256_unpacklo_epi64(b0, b1); /* A01234567 */ + x1 = _mm256_unpackhi_epi64(b0, b1); /* B01234567 */ + + mask = _mm256_set1_epi32(-1); + if (flagdown) { + x1 ^= mask; + } else { + x0 ^= mask; + } + + merge16_finish(x, x0, x1, flagdown); + return; + } + + if (n == 32) { + int32x8 x0, x1, x2, x3; + + int32_sort_2power(x, 16, 1); + int32_sort_2power(x + 16, 16, 0); + + x0 = int32x8_load(&x[0]); + x1 = int32x8_load(&x[8]); + x2 = int32x8_load(&x[16]); + x3 = int32x8_load(&x[24]); + + if (flagdown) { + mask = _mm256_set1_epi32(-1); + x0 ^= mask; + x1 ^= mask; + x2 ^= mask; + x3 ^= mask; + } + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + + merge16_finish(x, x0, x1, flagdown); + merge16_finish(x + 16, x2, x3, flagdown); + return; + } + + p = n >> 3; + for (i = 0; i < p; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x2 = int32x8_load(&x[i + 2 * p]); + int32x8 x4 = int32x8_load(&x[i + 4 * p]); + int32x8 x6 = int32x8_load(&x[i + 6 * p]); + + /* odd-even stage instead of bitonic stage */ + + int32x8_MINMAX(x4, x0); + int32x8_MINMAX(x6, x2); + int32x8_MINMAX(x2, x0); + int32x8_MINMAX(x6, x4); + int32x8_MINMAX(x2, x4); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + 2 * p], x2); + 
int32x8_store(&x[i + 4 * p], x4); + int32x8_store(&x[i + 6 * p], x6); + + int32x8 x1 = int32x8_load(&x[i + p]); + int32x8 x3 = int32x8_load(&x[i + 3 * p]); + int32x8 x5 = int32x8_load(&x[i + 5 * p]); + int32x8 x7 = int32x8_load(&x[i + 7 * p]); + + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x5, x3); + + int32x8_store(&x[i + p], x1); + int32x8_store(&x[i + 3 * p], x3); + int32x8_store(&x[i + 5 * p], x5); + int32x8_store(&x[i + 7 * p], x7); + } + + if (n >= 128) { + int flip, flipflip; + + mask = _mm256_set1_epi32(-1); + + for (j = 0; j < n; j += 32) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 16]); + x0 ^= mask; + x1 ^= mask; + int32x8_store(&x[j], x0); + int32x8_store(&x[j + 16], x1); + } + + p = 8; + for (;;) { /* for p in [8, 16, ..., n/16] */ + q = p >> 1; + while (q >= 128) { + int32_threestages(x, n, q >> 2); + q >>= 3; + } + if (q == 64) { + int32_twostages_32(x, n); + q = 16; + } + if (q == 32) { + q = 8; + for (k = 0; k < n; k += 8 * q) { + for (i = k; i < k + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + } + q = 4; + } + if (q == 16) { + q = 8; + for (k = 0; k < n; k += 4 * q) { + for (i = k; i < k + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + } + } + q = 4; + } + if (q == 8) { + for (k = 0; k < n; k += q + q) { + int32x8 x0 = int32x8_load(&x[k]); + int32x8 x1 = int32x8_load(&x[k + q]); + + int32x8_MINMAX(x0, x1); + + int32x8_store(&x[k], x0); + int32x8_store(&x[k + q], x1); + } + } + + q = n >> 3; + flip = 0; + if (p << 1 == q) { + flip = 1; + } + flipflip = 1 - flip; + for (j = 0; j < q; j += p + p) { + for (k = j; k < j + p + p; k += p) { + for (i = k; i < k + p; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + 
int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + + if (flip) { + x0 ^= mask; + x1 ^= mask; + x2 ^= mask; + x3 ^= mask; + x4 ^= mask; + x5 ^= mask; + x6 ^= mask; + x7 ^= mask; + } + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + flip ^= 1; + } + flip ^= flipflip; + } + + if (p << 4 == n) { + break; + } + p <<= 1; + } + } + + for (p = 4; p >= 1; p >>= 1) { + int32 *z = x; + int32 *target = x + n; + if (p == 4) { + mask = _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1); + while (z != target) { + int32x8 x0 = int32x8_load(&z[0]); + int32x8 x1 = int32x8_load(&z[8]); + x0 ^= mask; + x1 ^= mask; + int32x8_store(&z[0], x0); + int32x8_store(&z[8], x1); + z += 16; + } + } else if (p == 2) { + mask = _mm256_set_epi32(0, 0, -1, -1, -1, -1, 0, 0); + while (z != target) { + int32x8 x0 = int32x8_load(&z[0]); + int32x8 x1 = int32x8_load(&z[8]); + x0 ^= mask; + x1 ^= mask; + int32x8 b0 = _mm256_permute2x128_si256(x0, x1, 0x20); + int32x8 b1 = _mm256_permute2x128_si256(x0, x1, 0x31); + int32x8_MINMAX(b0, b1); + int32x8 c0 = _mm256_permute2x128_si256(b0, b1, 0x20); + int32x8 c1 = _mm256_permute2x128_si256(b0, b1, 0x31); + int32x8_store(&z[0], c0); + int32x8_store(&z[8], c1); + z += 16; + } + } else { /* p == 1 */ + mask = _mm256_set_epi32(0, -1, -1, 0, 0, -1, -1, 0); + while (z != target) { + int32x8 x0 = int32x8_load(&z[0]); + int32x8 x1 = int32x8_load(&z[8]); + x0 ^= mask; + x1 ^= mask; + int32x8 b0 = _mm256_permute2x128_si256(x0, x1, 0x20); /* A0123B0123 */ + int32x8 b1 = _mm256_permute2x128_si256(x0, x1, 0x31); /* A4567B4567 */ + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); /* A0145B0145 */ + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); /* A2367B2367 */ + int32x8_MINMAX(c0, c1); + int32x8 d0 = _mm256_unpacklo_epi64(c0, c1); /* A0123B0123 */ + int32x8 d1 = _mm256_unpackhi_epi64(c0, c1); /* A4567B4567 */ + int32x8_MINMAX(d0, d1); + int32x8 e0 = _mm256_permute2x128_si256(d0, d1, 0x20); + int32x8 e1 = _mm256_permute2x128_si256(d0, d1, 0x31); + int32x8_store(&z[0], e0); + int32x8_store(&z[8], e1); + z += 16; + } + } + + q = n >> 4; + while (q >= 128 || q == 32) { + int32_threestages(x, n, q >> 2); + q >>= 3; + } + while (q >= 16) { + q >>= 1; + for (j = 0; j < n; j += 4 * q) { + for (k = j; k < j + q; k += 8) { + int32x8 x0 = int32x8_load(&x[k]); + int32x8 x1 = int32x8_load(&x[k + q]); + int32x8 x2 = int32x8_load(&x[k + 2 * q]); + int32x8 x3 = int32x8_load(&x[k + 3 * q]); + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + + int32x8_store(&x[k], x0); + int32x8_store(&x[k + q], x1); + int32x8_store(&x[k + 2 * q], x2); + int32x8_store(&x[k + 3 * q], x3); + } + } + q >>= 1; + } + if (q == 8) { + for (j = 0; j < n; j += 2 * q) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + q]); + + int32x8_MINMAX(x0, x1); + + int32x8_store(&x[j], x0); + int32x8_store(&x[j + q], x1); + } + } + + q = n >> 3; + for (k = 0; k < q; k += 8) { + int32x8 x0 = int32x8_load(&x[k]); + int32x8 x1 = int32x8_load(&x[k + q]); + int32x8 x2 = int32x8_load(&x[k + 2 * q]); + int32x8 x3 = int32x8_load(&x[k + 3 * q]); + int32x8 x4 = int32x8_load(&x[k + 4 * q]); + int32x8 x5 = int32x8_load(&x[k + 5 * q]); + int32x8 x6 = int32x8_load(&x[k + 6 * q]); + int32x8 x7 = int32x8_load(&x[k + 7 * q]); 
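+        /*
+         * int32x8_MINMAX is the 8-lane compare-exchange helper used throughout this
+         * file; per 32-bit lane it is expected to leave the smaller value in its first
+         * argument and the larger in its second. The twelve calls below exchange the
+         * eight q-strided vectors at distances 1, 2 and 4 (in units of q).
+         */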
+ + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + + int32x8_store(&x[k], x0); + int32x8_store(&x[k + q], x1); + int32x8_store(&x[k + 2 * q], x2); + int32x8_store(&x[k + 3 * q], x3); + int32x8_store(&x[k + 4 * q], x4); + int32x8_store(&x[k + 5 * q], x5); + int32x8_store(&x[k + 6 * q], x6); + int32x8_store(&x[k + 7 * q], x7); + } + } + + /* everything is still masked with _mm256_set_epi32(0,-1,0,-1,0,-1,0,-1); */ + mask = _mm256_set1_epi32(-1); + + for (i = 0; i < n; i += 64) { + int32x8 a0 = int32x8_load(&x[i]); + int32x8 a1 = int32x8_load(&x[i + 8]); + int32x8 a2 = int32x8_load(&x[i + 16]); + int32x8 a3 = int32x8_load(&x[i + 24]); + int32x8 a4 = int32x8_load(&x[i + 32]); + int32x8 a5 = int32x8_load(&x[i + 40]); + int32x8 a6 = int32x8_load(&x[i + 48]); + int32x8 a7 = int32x8_load(&x[i + 56]); + + int32x8 b0 = _mm256_unpacklo_epi32(a0, a1); /* AB0AB1AB4AB5 */ + int32x8 b1 = _mm256_unpackhi_epi32(a0, a1); /* AB2AB3AB6AB7 */ + int32x8 b2 = _mm256_unpacklo_epi32(a2, a3); /* CD0CD1CD4CD5 */ + int32x8 b3 = _mm256_unpackhi_epi32(a2, a3); /* CD2CD3CD6CD7 */ + int32x8 b4 = _mm256_unpacklo_epi32(a4, a5); /* EF0EF1EF4EF5 */ + int32x8 b5 = _mm256_unpackhi_epi32(a4, a5); /* EF2EF3EF6EF7 */ + int32x8 b6 = _mm256_unpacklo_epi32(a6, a7); /* GH0GH1GH4GH5 */ + int32x8 b7 = _mm256_unpackhi_epi32(a6, a7); /* GH2GH3GH6GH7 */ + + int32x8 c0 = _mm256_unpacklo_epi64(b0, b2); /* ABCD0ABCD4 */ + int32x8 c1 = _mm256_unpacklo_epi64(b1, b3); /* ABCD2ABCD6 */ + int32x8 c2 = _mm256_unpackhi_epi64(b0, b2); /* ABCD1ABCD5 */ + int32x8 c3 = _mm256_unpackhi_epi64(b1, b3); /* ABCD3ABCD7 */ + int32x8 c4 = _mm256_unpacklo_epi64(b4, b6); /* EFGH0EFGH4 */ + int32x8 c5 = _mm256_unpacklo_epi64(b5, b7); /* EFGH2EFGH6 */ + int32x8 c6 = _mm256_unpackhi_epi64(b4, b6); /* EFGH1EFGH5 */ + int32x8 c7 = _mm256_unpackhi_epi64(b5, b7); /* EFGH3EFGH7 */ + + if (flagdown) { + c2 ^= mask; + c3 ^= mask; + c6 ^= mask; + c7 ^= mask; + } else { + c0 ^= mask; + c1 ^= mask; + c4 ^= mask; + c5 ^= mask; + } + + int32x8 d0 = _mm256_permute2x128_si256(c0, c4, 0x20); /* ABCDEFGH0 */ + int32x8 d1 = _mm256_permute2x128_si256(c2, c6, 0x20); /* ABCDEFGH1 */ + int32x8 d2 = _mm256_permute2x128_si256(c1, c5, 0x20); /* ABCDEFGH2 */ + int32x8 d3 = _mm256_permute2x128_si256(c3, c7, 0x20); /* ABCDEFGH5 */ + int32x8 d4 = _mm256_permute2x128_si256(c0, c4, 0x31); /* ABCDEFGH4 */ + int32x8 d5 = _mm256_permute2x128_si256(c2, c6, 0x31); /* ABCDEFGH3 */ + int32x8 d6 = _mm256_permute2x128_si256(c1, c5, 0x31); /* ABCDEFGH6 */ + int32x8 d7 = _mm256_permute2x128_si256(c3, c7, 0x31); /* ABCDEFGH7 */ + + int32x8_MINMAX(d0, d1); + int32x8_MINMAX(d2, d3); + int32x8_MINMAX(d4, d5); + int32x8_MINMAX(d6, d7); + int32x8_MINMAX(d0, d2); + int32x8_MINMAX(d1, d3); + int32x8_MINMAX(d4, d6); + int32x8_MINMAX(d5, d7); + int32x8_MINMAX(d0, d4); + int32x8_MINMAX(d1, d5); + int32x8_MINMAX(d2, d6); + int32x8_MINMAX(d3, d7); + + int32x8 e0 = _mm256_unpacklo_epi32(d0, d1); + int32x8 e1 = _mm256_unpackhi_epi32(d0, d1); + int32x8 e2 = _mm256_unpacklo_epi32(d2, d3); + int32x8 e3 = _mm256_unpackhi_epi32(d2, d3); + int32x8 e4 = _mm256_unpacklo_epi32(d4, d5); + int32x8 e5 = _mm256_unpackhi_epi32(d4, d5); + int32x8 e6 = _mm256_unpacklo_epi32(d6, d7); + int32x8 e7 = _mm256_unpackhi_epi32(d6, d7); + + int32x8 f0 = _mm256_unpacklo_epi64(e0, e2); 
+ int32x8 f1 = _mm256_unpacklo_epi64(e1, e3); + int32x8 f2 = _mm256_unpackhi_epi64(e0, e2); + int32x8 f3 = _mm256_unpackhi_epi64(e1, e3); + int32x8 f4 = _mm256_unpacklo_epi64(e4, e6); + int32x8 f5 = _mm256_unpacklo_epi64(e5, e7); + int32x8 f6 = _mm256_unpackhi_epi64(e4, e6); + int32x8 f7 = _mm256_unpackhi_epi64(e5, e7); + + int32x8 g0 = _mm256_permute2x128_si256(f0, f4, 0x20); + int32x8 g1 = _mm256_permute2x128_si256(f2, f6, 0x20); + int32x8 g2 = _mm256_permute2x128_si256(f1, f5, 0x20); + int32x8 g3 = _mm256_permute2x128_si256(f3, f7, 0x20); + int32x8 g4 = _mm256_permute2x128_si256(f0, f4, 0x31); + int32x8 g5 = _mm256_permute2x128_si256(f2, f6, 0x31); + int32x8 g6 = _mm256_permute2x128_si256(f1, f5, 0x31); + int32x8 g7 = _mm256_permute2x128_si256(f3, f7, 0x31); + + int32x8_store(&x[i], g0); + int32x8_store(&x[i + 8], g1); + int32x8_store(&x[i + 16], g2); + int32x8_store(&x[i + 24], g3); + int32x8_store(&x[i + 32], g4); + int32x8_store(&x[i + 40], g5); + int32x8_store(&x[i + 48], g6); + int32x8_store(&x[i + 56], g7); + } + + q = n >> 4; + while (q >= 128 || q == 32) { + q >>= 2; + for (j = 0; j < n; j += 8 * q) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + } + q >>= 1; + } + while (q >= 16) { + q >>= 1; + for (j = 0; j < n; j += 4 * q) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + } + } + q >>= 1; + } + if (q == 8) { + for (j = 0; j < n; j += q + q) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + q]); + int32x8_MINMAX(x0, x1); + int32x8_store(&x[j], x0); + int32x8_store(&x[j + q], x1); + } + } + + q = n >> 3; + for (i = 0; i < q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + 
int32x8_MINMAX(x3, x7); + + int32x8 b0 = _mm256_unpacklo_epi32(x0, x4); /* AE0AE1AE4AE5 */ + int32x8 b1 = _mm256_unpackhi_epi32(x0, x4); /* AE2AE3AE6AE7 */ + int32x8 b2 = _mm256_unpacklo_epi32(x1, x5); /* BF0BF1BF4BF5 */ + int32x8 b3 = _mm256_unpackhi_epi32(x1, x5); /* BF2BF3BF6BF7 */ + int32x8 b4 = _mm256_unpacklo_epi32(x2, x6); /* CG0CG1CG4CG5 */ + int32x8 b5 = _mm256_unpackhi_epi32(x2, x6); /* CG2CG3CG6CG7 */ + int32x8 b6 = _mm256_unpacklo_epi32(x3, x7); /* DH0DH1DH4DH5 */ + int32x8 b7 = _mm256_unpackhi_epi32(x3, x7); /* DH2DH3DH6DH7 */ + + int32x8 c0 = _mm256_unpacklo_epi64(b0, b4); /* AECG0AECG4 */ + int32x8 c1 = _mm256_unpacklo_epi64(b1, b5); /* AECG2AECG6 */ + int32x8 c2 = _mm256_unpackhi_epi64(b0, b4); /* AECG1AECG5 */ + int32x8 c3 = _mm256_unpackhi_epi64(b1, b5); /* AECG3AECG7 */ + int32x8 c4 = _mm256_unpacklo_epi64(b2, b6); /* BFDH0BFDH4 */ + int32x8 c5 = _mm256_unpacklo_epi64(b3, b7); /* BFDH2BFDH6 */ + int32x8 c6 = _mm256_unpackhi_epi64(b2, b6); /* BFDH1BFDH5 */ + int32x8 c7 = _mm256_unpackhi_epi64(b3, b7); /* BFDH3BFDH7 */ + + int32x8 d0 = _mm256_permute2x128_si256(c0, c4, 0x20); /* AECGBFDH0 */ + int32x8 d1 = _mm256_permute2x128_si256(c1, c5, 0x20); /* AECGBFDH2 */ + int32x8 d2 = _mm256_permute2x128_si256(c2, c6, 0x20); /* AECGBFDH1 */ + int32x8 d3 = _mm256_permute2x128_si256(c3, c7, 0x20); /* AECGBFDH3 */ + int32x8 d4 = _mm256_permute2x128_si256(c0, c4, 0x31); /* AECGBFDH4 */ + int32x8 d5 = _mm256_permute2x128_si256(c1, c5, 0x31); /* AECGBFDH6 */ + int32x8 d6 = _mm256_permute2x128_si256(c2, c6, 0x31); /* AECGBFDH5 */ + int32x8 d7 = _mm256_permute2x128_si256(c3, c7, 0x31); /* AECGBFDH7 */ + + if (flagdown) { + d0 ^= mask; + d1 ^= mask; + d2 ^= mask; + d3 ^= mask; + d4 ^= mask; + d5 ^= mask; + d6 ^= mask; + d7 ^= mask; + } + + int32x8_store(&x[i], d0); + int32x8_store(&x[i + q], d4); + int32x8_store(&x[i + 2 * q], d1); + int32x8_store(&x[i + 3 * q], d5); + int32x8_store(&x[i + 4 * q], d2); + int32x8_store(&x[i + 5 * q], d6); + int32x8_store(&x[i + 6 * q], d3); + int32x8_store(&x[i + 7 * q], d7); + } +} + +void PQCLEAN_MCELIECE348864F_AVX_int32_sort(int32 *x, size_t n) { + size_t q, i, j; + + if (n <= 8) { + if (n == 8) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + int32_MINMAX(&x[4], &x[5]); + int32_MINMAX(&x[5], &x[6]); + int32_MINMAX(&x[6], &x[7]); + } + if (n >= 7) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + int32_MINMAX(&x[4], &x[5]); + int32_MINMAX(&x[5], &x[6]); + } + if (n >= 6) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + int32_MINMAX(&x[4], &x[5]); + } + if (n >= 5) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + } + if (n >= 4) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + } + if (n >= 3) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + } + if (n >= 2) { + int32_MINMAX(&x[0], &x[1]); + } + return; + } + + if (!(n & (n - 1))) { + int32_sort_2power(x, n, 0); + return; + } + + q = 8; + while (q < n - q) { + q += q; + } + /* n > q >= 8 */ + + if (q <= 128) { /* n <= 256 */ + int32x8 y[32]; + for (i = q >> 3; i < q >> 2; ++i) { + y[i] = _mm256_set1_epi32(0x7fffffff); + } + for (i = 0; i < n; ++i) { + ((int32 *)y)[i] = x[i]; + } + int32_sort_2power((int32 *) y, 2 * q, 0); + for (i = 0; i < n; ++i) { + 
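+            /* the 0x7fffffff sentinels sort to the tail of y, so entries 0..n-1 are exactly the sorted input */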
x[i] = ((int32 *) y)[i]; + } + return; + } + + int32_sort_2power(x, q, 1); + PQCLEAN_MCELIECE348864F_AVX_int32_sort(x + q, n - q); + + while (q >= 64) { + q >>= 2; + j = int32_threestages(x, n, q); + minmax_vector(x + j, x + j + 4 * q, n - 4 * q - j); + if (j + 4 * q <= n) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + } + j += 4 * q; + } + minmax_vector(x + j, x + j + 2 * q, n - 2 * q - j); + if (j + 2 * q <= n) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8_MINMAX(x0, x1); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + } + j += 2 * q; + } + minmax_vector(x + j, x + j + q, n - q - j); + q >>= 1; + } + if (q == 32) { + j = 0; + for (; j + 64 <= n; j += 64) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 8]); + int32x8 x2 = int32x8_load(&x[j + 16]); + int32x8 x3 = int32x8_load(&x[j + 24]); + int32x8 x4 = int32x8_load(&x[j + 32]); + int32x8 x5 = int32x8_load(&x[j + 40]); + int32x8 x6 = int32x8_load(&x[j + 48]); + int32x8 x7 = int32x8_load(&x[j + 56]); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8 a0 = _mm256_permute2x128_si256(x0, x1, 0x20); + int32x8 a1 = _mm256_permute2x128_si256(x0, x1, 0x31); + int32x8 a2 = _mm256_permute2x128_si256(x2, x3, 0x20); + int32x8 a3 = _mm256_permute2x128_si256(x2, x3, 0x31); + int32x8 a4 = _mm256_permute2x128_si256(x4, x5, 0x20); + int32x8 a5 = _mm256_permute2x128_si256(x4, x5, 0x31); + int32x8 a6 = _mm256_permute2x128_si256(x6, x7, 0x20); + int32x8 a7 = _mm256_permute2x128_si256(x6, x7, 0x31); + int32x8_MINMAX(a0, a1); + int32x8_MINMAX(a2, a3); + int32x8_MINMAX(a4, a5); + int32x8_MINMAX(a6, a7); + int32x8 b0 = _mm256_permute2x128_si256(a0, a1, 0x20); + int32x8 b1 = _mm256_permute2x128_si256(a0, a1, 0x31); + int32x8 b2 = _mm256_permute2x128_si256(a2, a3, 0x20); + int32x8 b3 = _mm256_permute2x128_si256(a2, a3, 0x31); + int32x8 b4 = _mm256_permute2x128_si256(a4, a5, 0x20); + int32x8 b5 = _mm256_permute2x128_si256(a4, a5, 0x31); + int32x8 b6 = _mm256_permute2x128_si256(a6, a7, 0x20); + int32x8 b7 = _mm256_permute2x128_si256(a6, a7, 0x31); + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); + int32x8 c2 = _mm256_unpacklo_epi64(b2, b3); + int32x8 c3 = _mm256_unpackhi_epi64(b2, b3); + int32x8 c4 = _mm256_unpacklo_epi64(b4, b5); + int32x8 c5 = _mm256_unpackhi_epi64(b4, b5); + int32x8 c6 = _mm256_unpacklo_epi64(b6, b7); + int32x8 c7 = _mm256_unpackhi_epi64(b6, b7); + int32x8_MINMAX(c0, c1); + int32x8_MINMAX(c2, c3); + int32x8_MINMAX(c4, c5); + int32x8_MINMAX(c6, c7); + int32x8 d0 = _mm256_unpacklo_epi32(c0, c1); + int32x8 d1 = _mm256_unpackhi_epi32(c0, c1); + int32x8 d2 = _mm256_unpacklo_epi32(c2, c3); + int32x8 d3 = _mm256_unpackhi_epi32(c2, c3); + int32x8 d4 = _mm256_unpacklo_epi32(c4, c5); + int32x8 d5 = _mm256_unpackhi_epi32(c4, c5); + int32x8 d6 = 
_mm256_unpacklo_epi32(c6, c7); + int32x8 d7 = _mm256_unpackhi_epi32(c6, c7); + int32x8 e0 = _mm256_unpacklo_epi64(d0, d1); + int32x8 e1 = _mm256_unpackhi_epi64(d0, d1); + int32x8 e2 = _mm256_unpacklo_epi64(d2, d3); + int32x8 e3 = _mm256_unpackhi_epi64(d2, d3); + int32x8 e4 = _mm256_unpacklo_epi64(d4, d5); + int32x8 e5 = _mm256_unpackhi_epi64(d4, d5); + int32x8 e6 = _mm256_unpacklo_epi64(d6, d7); + int32x8 e7 = _mm256_unpackhi_epi64(d6, d7); + int32x8_MINMAX(e0, e1); + int32x8_MINMAX(e2, e3); + int32x8_MINMAX(e4, e5); + int32x8_MINMAX(e6, e7); + int32x8 f0 = _mm256_unpacklo_epi32(e0, e1); + int32x8 f1 = _mm256_unpackhi_epi32(e0, e1); + int32x8 f2 = _mm256_unpacklo_epi32(e2, e3); + int32x8 f3 = _mm256_unpackhi_epi32(e2, e3); + int32x8 f4 = _mm256_unpacklo_epi32(e4, e5); + int32x8 f5 = _mm256_unpackhi_epi32(e4, e5); + int32x8 f6 = _mm256_unpacklo_epi32(e6, e7); + int32x8 f7 = _mm256_unpackhi_epi32(e6, e7); + int32x8_store(&x[j], f0); + int32x8_store(&x[j + 8], f1); + int32x8_store(&x[j + 16], f2); + int32x8_store(&x[j + 24], f3); + int32x8_store(&x[j + 32], f4); + int32x8_store(&x[j + 40], f5); + int32x8_store(&x[j + 48], f6); + int32x8_store(&x[j + 56], f7); + } + minmax_vector(x + j, x + j + 32, n - 32 - j); + goto continue16; + } + if (q == 16) { + j = 0; +continue16: + for (; j + 32 <= n; j += 32) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 8]); + int32x8 x2 = int32x8_load(&x[j + 16]); + int32x8 x3 = int32x8_load(&x[j + 24]); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8 a0 = _mm256_permute2x128_si256(x0, x1, 0x20); + int32x8 a1 = _mm256_permute2x128_si256(x0, x1, 0x31); + int32x8 a2 = _mm256_permute2x128_si256(x2, x3, 0x20); + int32x8 a3 = _mm256_permute2x128_si256(x2, x3, 0x31); + int32x8_MINMAX(a0, a1); + int32x8_MINMAX(a2, a3); + int32x8 b0 = _mm256_permute2x128_si256(a0, a1, 0x20); + int32x8 b1 = _mm256_permute2x128_si256(a0, a1, 0x31); + int32x8 b2 = _mm256_permute2x128_si256(a2, a3, 0x20); + int32x8 b3 = _mm256_permute2x128_si256(a2, a3, 0x31); + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); + int32x8 c2 = _mm256_unpacklo_epi64(b2, b3); + int32x8 c3 = _mm256_unpackhi_epi64(b2, b3); + int32x8_MINMAX(c0, c1); + int32x8_MINMAX(c2, c3); + int32x8 d0 = _mm256_unpacklo_epi32(c0, c1); + int32x8 d1 = _mm256_unpackhi_epi32(c0, c1); + int32x8 d2 = _mm256_unpacklo_epi32(c2, c3); + int32x8 d3 = _mm256_unpackhi_epi32(c2, c3); + int32x8 e0 = _mm256_unpacklo_epi64(d0, d1); + int32x8 e1 = _mm256_unpackhi_epi64(d0, d1); + int32x8 e2 = _mm256_unpacklo_epi64(d2, d3); + int32x8 e3 = _mm256_unpackhi_epi64(d2, d3); + int32x8_MINMAX(e0, e1); + int32x8_MINMAX(e2, e3); + int32x8 f0 = _mm256_unpacklo_epi32(e0, e1); + int32x8 f1 = _mm256_unpackhi_epi32(e0, e1); + int32x8 f2 = _mm256_unpacklo_epi32(e2, e3); + int32x8 f3 = _mm256_unpackhi_epi32(e2, e3); + int32x8_store(&x[j], f0); + int32x8_store(&x[j + 8], f1); + int32x8_store(&x[j + 16], f2); + int32x8_store(&x[j + 24], f3); + } + minmax_vector(x + j, x + j + 16, n - 16 - j); + goto continue8; + } + /* q == 8 */ + j = 0; +continue8: + for (; j + 16 <= n; j += 16) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 8]); + int32x8_MINMAX(x0, x1); + int32x8_store(&x[j], x0); + int32x8_store(&x[j + 8], x1); + int32x8 a0 = _mm256_permute2x128_si256(x0, x1, 0x20); /* x0123y0123 */ + int32x8 a1 = _mm256_permute2x128_si256(x0, x1, 0x31); /* x4567y4567 */ + int32x8_MINMAX(a0, a1); + int32x8 b0 = 
_mm256_permute2x128_si256(a0, a1, 0x20); /* x01234567 */ + int32x8 b1 = _mm256_permute2x128_si256(a0, a1, 0x31); /* y01234567 */ + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); /* x01y01x45y45 */ + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); /* x23y23x67y67 */ + int32x8_MINMAX(c0, c1); + int32x8 d0 = _mm256_unpacklo_epi32(c0, c1); /* x02x13x46x57 */ + int32x8 d1 = _mm256_unpackhi_epi32(c0, c1); /* y02y13y46y57 */ + int32x8 e0 = _mm256_unpacklo_epi64(d0, d1); /* x02y02x46y46 */ + int32x8 e1 = _mm256_unpackhi_epi64(d0, d1); /* x13y13x57y57 */ + int32x8_MINMAX(e0, e1); + int32x8 f0 = _mm256_unpacklo_epi32(e0, e1); /* x01234567 */ + int32x8 f1 = _mm256_unpackhi_epi32(e0, e1); /* y01234567 */ + int32x8_store(&x[j], f0); + int32x8_store(&x[j + 8], f1); + } + minmax_vector(x + j, x + j + 8, n - 8 - j); + if (j + 8 <= n) { + int32_MINMAX(&x[j], &x[j + 4]); + int32_MINMAX(&x[j + 1], &x[j + 5]); + int32_MINMAX(&x[j + 2], &x[j + 6]); + int32_MINMAX(&x[j + 3], &x[j + 7]); + int32_MINMAX(&x[j], &x[j + 2]); + int32_MINMAX(&x[j + 1], &x[j + 3]); + int32_MINMAX(&x[j], &x[j + 1]); + int32_MINMAX(&x[j + 2], &x[j + 3]); + int32_MINMAX(&x[j + 4], &x[j + 6]); + int32_MINMAX(&x[j + 5], &x[j + 7]); + int32_MINMAX(&x[j + 4], &x[j + 5]); + int32_MINMAX(&x[j + 6], &x[j + 7]); + j += 8; + } + minmax_vector(x + j, x + j + 4, n - 4 - j); + if (j + 4 <= n) { + int32_MINMAX(&x[j], &x[j + 2]); + int32_MINMAX(&x[j + 1], &x[j + 3]); + int32_MINMAX(&x[j], &x[j + 1]); + int32_MINMAX(&x[j + 2], &x[j + 3]); + j += 4; + } + if (j + 3 <= n) { + int32_MINMAX(&x[j], &x[j + 2]); + } + if (j + 2 <= n) { + int32_MINMAX(&x[j], &x[j + 1]); + } +} diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/int32_sort.h b/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/int32_sort.h new file mode 100644 index 0000000000..3d07fbfb5d --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/int32_sort.h @@ -0,0 +1,9 @@ +#ifndef PQCLEAN_MCELIECE348864F_AVX_INT32_SORT_H +#define PQCLEAN_MCELIECE348864F_AVX_INT32_SORT_H + +#include +#include + +void PQCLEAN_MCELIECE348864F_AVX_int32_sort(int32_t *x, size_t n); + +#endif diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/operations.c b/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/operations.c new file mode 100644 index 0000000000..cefb5d6b44 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/operations.c @@ -0,0 +1,136 @@ +#include "api.h" + +#include "aes256ctr.h" +#include "controlbits.h" +#include "crypto_hash.h" +#include "decrypt.h" +#include "encrypt.h" +#include "params.h" +#include "pk_gen.h" +#include "randombytes.h" +#include "sk_gen.h" +#include "util.h" + +#include +#include + +int PQCLEAN_MCELIECE348864F_AVX_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const uint8_t *pk +) { + uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; + uint8_t *e = two_e + 1; + uint8_t one_ec[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ] = {1}; + + PQCLEAN_MCELIECE348864F_AVX_encrypt(c, e, pk); + + crypto_hash_32b(c + SYND_BYTES, two_e, sizeof(two_e)); + + memcpy(one_ec + 1, e, SYS_N / 8); + memcpy(one_ec + 1 + SYS_N / 8, c, SYND_BYTES + 32); + + crypto_hash_32b(key, one_ec, sizeof(one_ec)); + + return 0; +} + +int PQCLEAN_MCELIECE348864F_AVX_crypto_kem_dec( + uint8_t *key, + const uint8_t *c, + const uint8_t *sk +) { + int i; + + uint8_t ret_confirm = 0; + uint8_t ret_decrypt = 0; + + uint16_t m; + + uint8_t conf[32]; + uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; + uint8_t *e = two_e + 1; + uint8_t preimage[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ]; + uint8_t *x = 
preimage; + + // + + ret_decrypt = (uint8_t)PQCLEAN_MCELIECE348864F_AVX_decrypt(e, sk + SYS_N / 8, c); + + crypto_hash_32b(conf, two_e, sizeof(two_e)); + + for (i = 0; i < 32; i++) { + ret_confirm |= conf[i] ^ c[SYND_BYTES + i]; + } + + m = ret_decrypt | ret_confirm; + m -= 1; + m >>= 8; + + *x++ = (~m & 0) | (m & 1); + for (i = 0; i < SYS_N / 8; i++) { + *x++ = (~m & sk[i]) | (m & e[i]); + } + for (i = 0; i < SYND_BYTES + 32; i++) { + *x++ = c[i]; + } + + crypto_hash_32b(key, preimage, sizeof(preimage)); + + return 0; +} + +int PQCLEAN_MCELIECE348864F_AVX_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +) { + int i; + uint8_t seed[ 32 ]; + uint8_t r[ SYS_T * 2 + (1 << GFBITS)*sizeof(uint32_t) + SYS_N / 8 + 32 ]; + uint8_t nonce[ 16 ] = {0}; + uint8_t *rp; + + gf f[ SYS_T ]; // element in GF(2^mt) + gf irr[ SYS_T ]; // Goppa polynomial + uint32_t perm[ 1 << GFBITS ]; // random permutation + + randombytes(seed, sizeof(seed)); + + while (1) { + rp = r; + PQCLEAN_MCELIECE348864F_AVX_aes256ctr(r, sizeof(r), nonce, seed); + memcpy(seed, &r[ sizeof(r) - 32 ], 32); + + for (i = 0; i < SYS_T; i++) { + f[i] = PQCLEAN_MCELIECE348864F_AVX_load2(rp + i * 2); + } + rp += sizeof(f); + if (PQCLEAN_MCELIECE348864F_AVX_genpoly_gen(irr, f)) { + continue; + } + + for (i = 0; i < (1 << GFBITS); i++) { + perm[i] = PQCLEAN_MCELIECE348864F_AVX_load4(rp + i * 4); + } + rp += sizeof(perm); + if (PQCLEAN_MCELIECE348864F_AVX_perm_check(perm)) { + continue; + } + + for (i = 0; i < SYS_T; i++) { + PQCLEAN_MCELIECE348864F_AVX_store2(sk + SYS_N / 8 + i * 2, irr[i]); + } + if (PQCLEAN_MCELIECE348864F_AVX_pk_gen(pk, perm, sk + SYS_N / 8)) { + continue; + } + + memcpy(sk, rp, SYS_N / 8); + PQCLEAN_MCELIECE348864F_AVX_controlbits(sk + SYS_N / 8 + IRR_BYTES, perm); + + break; + } + + return 0; +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/params.h b/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/params.h new file mode 100644 index 0000000000..c0474ae1b8 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/params.h @@ -0,0 +1,21 @@ +#ifndef PQCLEAN_MCELIECE348864F_AVX_PARAMS_H +#define PQCLEAN_MCELIECE348864F_AVX_PARAMS_H + +#define GFBITS 12 +#define SYS_N 3488 +#define SYS_T 64 + +#define COND_BYTES ((1 << (GFBITS-4))*(2*GFBITS - 1)) +#define IRR_BYTES (SYS_T * 2) + +#define PK_NROWS (SYS_T*GFBITS) +#define PK_NCOLS (SYS_N - PK_NROWS) +#define PK_ROW_BYTES ((PK_NCOLS + 7)/8) + +#define SK_BYTES (SYS_N/8 + IRR_BYTES + COND_BYTES) +#define SYND_BYTES ((PK_NROWS + 7)/8) + +#define GFMASK ((1 << GFBITS) - 1) + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/pk_gen.c b/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/pk_gen.c new file mode 100644 index 0000000000..0d61f3606d --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/pk_gen.c @@ -0,0 +1,329 @@ +/* + This file is for public-key generation +*/ + +#include "pk_gen.h" + +#include "benes.h" +#include "controlbits.h" +#include "fft.h" +#include "params.h" +#include "transpose.h" +#include "util.h" + +#include + +#define min(a, b) (((a) < (b)) ? 
(a) : (b)) + +static void de_bitslicing(uint64_t *out, vec256 in[][GFBITS]) { + int i, j, r; + uint64_t u = 0; + + for (i = 0; i < (1 << GFBITS); i++) { + out[i] = 0 ; + } + + for (i = 0; i < 16; i++) { + for (j = GFBITS - 1; j >= 0; j--) { + u = PQCLEAN_MCELIECE348864F_AVX_vec256_extract(in[i][j], 0); + for (r = 0; r < 64; r++) { + out[i * 256 + 0 * 64 + r] <<= 1; + out[i * 256 + 0 * 64 + r] |= (u >> r) & 1; + } + u = PQCLEAN_MCELIECE348864F_AVX_vec256_extract(in[i][j], 1); + for (r = 0; r < 64; r++) { + out[i * 256 + 1 * 64 + r] <<= 1; + out[i * 256 + 1 * 64 + r] |= (u >> r) & 1; + } + u = PQCLEAN_MCELIECE348864F_AVX_vec256_extract(in[i][j], 2); + for (r = 0; r < 64; r++) { + out[i * 256 + 2 * 64 + r] <<= 1; + out[i * 256 + 2 * 64 + r] |= (u >> r) & 1; + } + u = PQCLEAN_MCELIECE348864F_AVX_vec256_extract(in[i][j], 3); + for (r = 0; r < 64; r++) { + out[i * 256 + 3 * 64 + r] <<= 1; + out[i * 256 + 3 * 64 + r] |= (u >> r) & 1; + } + } + } +} + +static void to_bitslicing_2x(vec256 out0[][GFBITS], vec256 out1[][GFBITS], const uint64_t *in) { + int i, j, k, r; + uint64_t u[4] = {0}; + + for (i = 0; i < 16; i++) { + for (j = GFBITS - 1; j >= 0; j--) { + for (k = 0; k < 4; k++) { + for (r = 63; r >= 0; r--) { + u[k] <<= 1; + u[k] |= (in[i * 256 + k * 64 + r] >> (j + GFBITS)) & 1; + } + } + + out1[i][j] = PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(u[0], u[1], u[2], u[3]); + } + + for (j = GFBITS - 1; j >= 0; j--) { + for (k = 0; k < 4; k++) { + for (r = 63; r >= 0; r--) { + u[k] <<= 1; + u[k] |= (in[i * 256 + k * 64 + r] >> j) & 1; + } + } + + out0[i][GFBITS - 1 - j] = PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(u[0], u[1], u[2], u[3]); + } + } +} + +/* return number of trailing zeros of the non-zero input in */ +static inline int ctz(uint64_t in) { + return (int)_tzcnt_u64(in); +} + +static inline uint64_t same_mask(uint16_t x, uint16_t y) { + uint64_t mask; + + mask = x ^ y; + mask -= 1; + mask >>= 63; + mask = -mask; + + return mask; +} + +static int mov_columns(uint64_t mat[][ ((SYS_N + 255) / 256) * 4 ], uint32_t *perm) { + int i, j, k, s, block_idx, row; + uint64_t buf[64], ctz_list[32], t, d, mask; + + row = GFBITS * SYS_T - 32; + block_idx = row / 64; + + // extract the 32x64 matrix + + for (i = 0; i < 32; i++) { + buf[i] = (mat[ row + i ][ block_idx + 0 ] >> 32) | + (mat[ row + i ][ block_idx + 1 ] << 32); + } + + // compute the column indices of pivots by Gaussian elimination. 
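+    // (the pivot for row i is the lowest bit set in any of the rows i..31 that remain)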
+ // the indices are stored in ctz_list + + for (i = 0; i < 32; i++) { + t = buf[i]; + for (j = i + 1; j < 32; j++) { + t |= buf[j]; + } + + if (t == 0) { + return -1; // return if buf is not full rank + } + + ctz_list[i] = s = ctz(t); + + for (j = i + 1; j < 32; j++) { + mask = (buf[i] >> s) & 1; + mask -= 1; + buf[i] ^= buf[j] & mask; + } + for (j = 0; j < i; j++) { + mask = (buf[j] >> s) & 1; + mask = -mask; + buf[j] ^= buf[i] & mask; + } + for (j = i + 1; j < 32; j++) { + mask = (buf[j] >> s) & 1; + mask = -mask; + buf[j] ^= buf[i] & mask; + } + } + + // updating permutation + + for (j = 0; j < 32; j++) { + for (k = j + 1; k < 64; k++) { + d = perm[ row + j ] ^ perm[ row + k ]; + d &= same_mask(k, ctz_list[j]); + perm[ row + j ] ^= d; + perm[ row + k ] ^= d; + } + } + + // moving columns of mat according to the column indices of pivots + + for (i = 0; i < GFBITS * SYS_T; i += 64) { + + for (j = 0; j < min(64, GFBITS * SYS_T - i); j++) { + buf[j] = (mat[ i + j ][ block_idx + 0 ] >> 32) | + (mat[ i + j ][ block_idx + 1 ] << 32); + } + + PQCLEAN_MCELIECE348864F_AVX_transpose_64x64(buf); + + for (j = 0; j < 32; j++) { + for (k = j + 1; k < 64; k++) { + d = buf[ j ] ^ buf[ k ]; + d &= same_mask(k, ctz_list[j]); + buf[ j ] ^= d; + buf[ k ] ^= d; + } + } + + PQCLEAN_MCELIECE348864F_AVX_transpose_64x64(buf); + + for (j = 0; j < min(64, GFBITS * SYS_T - i); j++) { + mat[ i + j ][ block_idx + 0 ] = (mat[ i + j ][ block_idx + 0 ] << 32 >> 32) | (buf[j] << 32); + mat[ i + j ][ block_idx + 1 ] = (mat[ i + j ][ block_idx + 1 ] >> 32 << 32) | (buf[j] >> 32); + } + } + + return 0; +} + +#define NBLOCKS1_H ((SYS_N + 63) / 64) +#define NBLOCKS2_H ((SYS_N + 255) / 256) +#define NBLOCKS1_I ((GFBITS * SYS_T + 63) / 64) +int PQCLEAN_MCELIECE348864F_AVX_pk_gen(unsigned char *pk, uint32_t *perm, const unsigned char *sk) { + + int i, j, k; + int row, c; + + uint64_t mat[ GFBITS * SYS_T ][ NBLOCKS2_H * 4 ]; + + uint64_t mask; + + uint64_t sk_int[ GFBITS ]; + + vec256 consts[ 16 ][ GFBITS ]; + vec256 eval[ 16 ][ GFBITS ]; + vec256 prod[ 16 ][ GFBITS ]; + vec256 tmp[ GFBITS ]; + + uint64_t list[1 << GFBITS]; + + // compute the inverses + + PQCLEAN_MCELIECE348864F_AVX_irr_load(sk_int, sk); + + PQCLEAN_MCELIECE348864F_AVX_fft(eval, sk_int); + + PQCLEAN_MCELIECE348864F_AVX_vec256_copy(prod[0], eval[0]); + + for (i = 1; i < 16; i++) { + PQCLEAN_MCELIECE348864F_AVX_vec256_mul(prod[i], prod[i - 1], eval[i]); + } + + PQCLEAN_MCELIECE348864F_AVX_vec256_inv(tmp, prod[15]); + + for (i = 14; i >= 0; i--) { + PQCLEAN_MCELIECE348864F_AVX_vec256_mul(prod[i + 1], prod[i], tmp); + PQCLEAN_MCELIECE348864F_AVX_vec256_mul(tmp, tmp, eval[i + 1]); + } + + PQCLEAN_MCELIECE348864F_AVX_vec256_copy(prod[0], tmp); + + // fill matrix + + de_bitslicing(list, prod); + + for (i = 0; i < (1 << GFBITS); i++) { + list[i] <<= GFBITS; + list[i] |= i; + list[i] |= ((uint64_t) perm[i]) << 31; + } + + PQCLEAN_MCELIECE348864F_AVX_sort_63b(1 << GFBITS, list); + + to_bitslicing_2x(consts, prod, list); + + for (i = 0; i < (1 << GFBITS); i++) { + perm[i] = list[i] & GFMASK; + } + + for (j = 0; j < NBLOCKS2_H; j++) { + for (k = 0; k < GFBITS; k++) { + mat[ k ][ 4 * j + 0 ] = PQCLEAN_MCELIECE348864F_AVX_vec256_extract(prod[ j ][ k ], 0); + mat[ k ][ 4 * j + 1 ] = PQCLEAN_MCELIECE348864F_AVX_vec256_extract(prod[ j ][ k ], 1); + mat[ k ][ 4 * j + 2 ] = PQCLEAN_MCELIECE348864F_AVX_vec256_extract(prod[ j ][ k ], 2); + mat[ k ][ 4 * j + 3 ] = PQCLEAN_MCELIECE348864F_AVX_vec256_extract(prod[ j ][ k ], 3); + } + } + + for (i = 1; i < SYS_T; i++) { + for (j = 0; j 
< NBLOCKS2_H; j++) { + PQCLEAN_MCELIECE348864F_AVX_vec256_mul(prod[j], prod[j], consts[j]); + + for (k = 0; k < GFBITS; k++) { + mat[ i * GFBITS + k ][ 4 * j + 0 ] = PQCLEAN_MCELIECE348864F_AVX_vec256_extract(prod[ j ][ k ], 0); + mat[ i * GFBITS + k ][ 4 * j + 1 ] = PQCLEAN_MCELIECE348864F_AVX_vec256_extract(prod[ j ][ k ], 1); + mat[ i * GFBITS + k ][ 4 * j + 2 ] = PQCLEAN_MCELIECE348864F_AVX_vec256_extract(prod[ j ][ k ], 2); + mat[ i * GFBITS + k ][ 4 * j + 3 ] = PQCLEAN_MCELIECE348864F_AVX_vec256_extract(prod[ j ][ k ], 3); + } + } + } + + // gaussian elimination + + for (row = 0; row < PK_NROWS; row++) { + i = row >> 6; + j = row & 63; + + if (row == GFBITS * SYS_T - 32) { + if (mov_columns(mat, perm)) { + return -1; + } + } + + for (k = row + 1; k < PK_NROWS; k++) { + mask = mat[ row ][ i ] >> j; + mask &= 1; + mask -= 1; + + for (c = 0; c < NBLOCKS1_H; c++) { + mat[ row ][ c ] ^= mat[ k ][ c ] & mask; + } + } + + if ( ((mat[ row ][ i ] >> j) & 1) == 0 ) { // return if not systematic + return -1; + } + + for (k = 0; k < row; k++) { + mask = mat[ k ][ i ] >> j; + mask &= 1; + mask = -mask; + + for (c = 0; c < NBLOCKS1_H; c++) { + mat[ k ][ c ] ^= mat[ row ][ c ] & mask; + } + } + + for (k = row + 1; k < PK_NROWS; k++) { + mask = mat[ k ][ i ] >> j; + mask &= 1; + mask = -mask; + + for (c = 0; c < NBLOCKS1_H; c++) { + mat[ k ][ c ] ^= mat[ row ][ c ] & mask; + } + } + } + + for (i = 0; i < GFBITS * SYS_T; i++) { + for (j = NBLOCKS1_I; j < NBLOCKS1_H - 1; j++) { + PQCLEAN_MCELIECE348864F_AVX_store8(pk, mat[i][j]); + pk += 8; + } + + PQCLEAN_MCELIECE348864F_AVX_store_i(pk, mat[i][j], PK_ROW_BYTES % 8); + + pk += PK_ROW_BYTES % 8; + } + + // + + return 0; +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/pk_gen.h b/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/pk_gen.h new file mode 100644 index 0000000000..e8a1f9db44 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/pk_gen.h @@ -0,0 +1,13 @@ +#ifndef PQCLEAN_MCELIECE348864F_AVX_PK_GEN_H +#define PQCLEAN_MCELIECE348864F_AVX_PK_GEN_H +/* + This file is for public-key generation +*/ + + +#include "gf.h" + +int PQCLEAN_MCELIECE348864F_AVX_pk_gen(unsigned char * /*pk*/, uint32_t * /*perm*/, const unsigned char * /*sk*/); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/powers.inc b/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/powers.inc new file mode 100644 index 0000000000..cb21ce5bcd --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/powers.inc @@ -0,0 +1,224 @@ +{ + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 
0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x5555555555555555, 0x5555555555555555, 0x5555555555555555, 0x5555555555555555), +}, +{ + 
PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x5555555555555555, 0x5555555555555555, 0x5555555555555555, 0x5555555555555555), +}, +{ + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 
0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x5555555555555555, 0x5555555555555555, 0x5555555555555555, 0x5555555555555555), +}, +{ + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF), + 
PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x5555555555555555, 0x5555555555555555, 0x5555555555555555, 0x5555555555555555), +}, +{ + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFF0000FFFF0000, 
0xFFFF0000FFFF0000, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x5555555555555555, 0x5555555555555555, 0x5555555555555555, 0x5555555555555555), +}, +{ + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC, 0x3333CCCC3333CCCC), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x5555555555555555, 0x5555555555555555, 0x5555555555555555, 0x5555555555555555), +}, +{ + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), + 
PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA), +}, +{ + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F, 0xF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xCCCC3333CCCC3333, 
0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x5555555555555555, 0x5555555555555555, 0x5555555555555555, 0x5555555555555555), +}, +{ + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0000000000000000, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA, 0xAA55AA55AA55AA55, 0x55AA55AA55AA55AA), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0, 0x0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333, 0xCCCC3333CCCC3333), + PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(0x5555555555555555, 0x5555555555555555, 0x5555555555555555, 0x5555555555555555), +}, diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/scalars.inc b/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/scalars.inc new file mode 100644 index 0000000000..aa8f64b951 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/scalars.inc @@ -0,0 +1,70 @@ +{ + 0XF3CFC030FC30F003, + 0X3FCF0F003C00C00C, + 0X30033CC300C0C03C, + 0XCCFF0F3C0F30F0C0, + 0X0300C03FF303C3F0, + 0X3FFF3C0FF0CCCCC0, + 0XF3FFF0C00F3C3CC0, + 0X3003333FFFC3C000, + 0X0FF30FFFC3FFF300, + 0XFFC0F300F0F0CC00, + 0XC0CFF3FCCC3CFC00, + 0XFC3C03F0F330C000, +}, +{ + 0X000F00000000F00F, + 0X00000F00F00000F0, + 0X0F00000F00000F00, + 0XF00F00F00F000000, + 0X00F00000000000F0, + 0X0000000F00000000, + 0XF00000000F00F000, + 0X00F00F00000F0000, + 0X0000F00000F00F00, + 0X000F00F00F00F000, + 0X00F00F0000000000, + 0X0000000000F00000, +}, +{ + 0X0000FF00FF0000FF, + 0X0000FF000000FF00, + 0XFF0000FF00FF0000, + 0XFFFF0000FF000000, + 0X00FF00FF00FF0000, + 0X0000FFFFFF000000, + 0X00FFFF00FF000000, + 0XFFFFFF0000FF0000, + 0XFFFF00FFFF00FF00, + 0X0000FF0000000000, + 0XFFFFFF00FF000000, + 0X00FF000000000000, +}, +{ + 0X000000000000FFFF, + 0X00000000FFFF0000, + 0X0000000000000000, + 0XFFFF000000000000, + 0X00000000FFFF0000, + 0X0000FFFF00000000, + 0X0000000000000000, + 0X00000000FFFF0000, + 0X0000FFFF00000000, + 0X0000000000000000, + 0X0000000000000000, + 0X0000000000000000, +}, +{ + 0X00000000FFFFFFFF, + 0XFFFFFFFF00000000, + 0XFFFFFFFF00000000, + 0X0000000000000000, + 0X0000000000000000, + 0XFFFFFFFF00000000, + 0X0000000000000000, + 0X0000000000000000, + 0XFFFFFFFF00000000, + 0X0000000000000000, + 0X0000000000000000, + 0X0000000000000000, +} diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/scalars_2x.inc b/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/scalars_2x.inc new file mode 100644 index 0000000000..5d690ca210 --- /dev/null +++ 
b/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/scalars_2x.inc @@ -0,0 +1,70 @@ +{ + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0xf3cfc030fc30f003, 0x000c03c0c3c0330c), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x3fcf0f003c00c00c, 0xf330cffcc00f33c0), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x30033cc300c0c03c, 0xccf330f00f3c0333), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0xccff0f3c0f30f0c0, 0xff03fff3ff0cf0c0), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x0300c03ff303c3f0, 0x3cc3fcf00fcc303c), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x3fff3c0ff0ccccc0, 0x0f000c0fc30303f3), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0xf3fff0c00f3c3cc0, 0xcf0fc3ff333ccf3c), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x3003333fffc3c000, 0x003f3fc3c0ff333f), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x0ff30fffc3fff300, 0x3cc3f0f3cf0ff00f), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0xffc0f300f0f0cc00, 0xf3f33cc03fc30cc0), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0xc0cff3fccc3cfc00, 0x3cc330cfc333f33f), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0xfc3c03f0f330c000, 0x3cc0303ff3c3fffc), +}, +{ + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x000f00000000f00f, 0x0f00f00f00000000), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x00000f00f00000f0, 0xf00000000000f000), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x0f00000f00000f00, 0x00000f00000000f0), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0xf00f00f00f000000, 0x0f00f00000f00000), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x00f00000000000f0, 0x000f00000f00f00f), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x0000000f00000000, 0x00f00f00f00f0000), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0xf00000000f00f000, 0x0f00f00000000000), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x00f00f00000f0000, 0x000000000f000000), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x0000f00000f00f00, 0x00f00000000f00f0), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x000f00f00f00f000, 0x0000f00f00000f00), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x00f00f0000000000, 0xf00000f00000f00f), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x0000000000f00000, 0x00000f00f00f00f0), +}, +{ + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x0000ff00ff0000ff, 0xff00ffffff000000), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x0000ff000000ff00, 0xff0000ffff000000), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0xff0000ff00ff0000, 0xffff00ffff000000), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0xffff0000ff000000, 0xff00ffffffffff00), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x00ff00ff00ff0000, 0x00000000ff00ff00), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x0000ffffff000000, 0xffffffff00ff0000), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x00ffff00ff000000, 0x00ffffff00ff0000), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0xffffff0000ff0000, 0xffff00ffff00ffff), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0xffff00ffff00ff00, 0xffff0000ffffffff), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x0000ff0000000000, 0xff00000000ff0000), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0xffffff00ff000000, 0x000000ff00ff00ff), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x00ff000000000000, 0x00ff00ff00ffff00), +}, +{ + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x000000000000ffff, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x00000000ffff0000, 0xffff000000000000), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0xffff000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x00000000ffff0000, 0xffff00000000ffff), 
+ PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x0000ffff00000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x0000000000000000, 0x0000ffff00000000), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x00000000ffff0000, 0xffff00000000ffff), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x0000ffff00000000, 0x00000000ffff0000), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x0000000000000000, 0xffff00000000ffff), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x0000000000000000, 0x00000000ffff0000), +}, +{ + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x00000000ffffffff, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0xffffffff00000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0xffffffff00000000, 0x00000000ffffffff), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x0000000000000000, 0xffffffff00000000), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x0000000000000000, 0xffffffff00000000), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0xffffffff00000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x0000000000000000, 0xffffffff00000000), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x0000000000000000, 0xffffffffffffffff), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0xffffffff00000000, 0xffffffff00000000), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x0000000000000000, 0x0000000000000000), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x0000000000000000, 0xffffffffffffffff), + PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(0x0000000000000000, 0xffffffff00000000), +}, diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/sk_gen.c b/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/sk_gen.c new file mode 100644 index 0000000000..29a7f7668e --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/sk_gen.c @@ -0,0 +1,98 @@ +/* + This file is for secret-key generation +*/ + +#include "sk_gen.h" + +#include "controlbits.h" +#include "gf.h" +#include "params.h" +#include "util.h" + +/* input: f, element in GF((2^m)^t) */ +/* output: out, minimal polynomial of f */ +/* return: 0 for success and -1 for failure */ +int PQCLEAN_MCELIECE348864F_AVX_genpoly_gen(gf *out, gf *f) { + int i, j, k, c; + + gf mat[ SYS_T + 1 ][ SYS_T ]; + gf mask, inv, t; + + // fill matrix + + mat[0][0] = 1; + + for (i = 1; i < SYS_T; i++) { + mat[0][i] = 0; + } + + for (i = 0; i < SYS_T; i++) { + mat[1][i] = f[i]; + } + + for (j = 2; j <= SYS_T; j++) { + PQCLEAN_MCELIECE348864F_AVX_GF_mul(mat[j], mat[j - 1], f); + } + + // gaussian + + for (j = 0; j < SYS_T; j++) { + for (k = j + 1; k < SYS_T; k++) { + mask = PQCLEAN_MCELIECE348864F_AVX_gf_iszero(mat[ j ][ j ]); + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ j ] ^= mat[ c ][ k ] & mask; + } + + } + + if ( mat[ j ][ j ] == 0 ) { // return if not systematic + return -1; + } + + inv = PQCLEAN_MCELIECE348864F_AVX_gf_inv(mat[j][j]); + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ j ] = PQCLEAN_MCELIECE348864F_AVX_gf_mul(mat[ c ][ j ], inv) ; + } + + for (k = 0; k < SYS_T; k++) { + if (k != j) { + t = mat[ j ][ k ]; + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ k ] ^= PQCLEAN_MCELIECE348864F_AVX_gf_mul(mat[ c ][ j ], t); + } + } + } + } + + for (i = 0; i < SYS_T; i++) { + out[i] = mat[ SYS_T ][ i ]; + } + + return 0; +} + +/* input: permutation p represented as a list of 32-bit intergers */ +/* output: -1 if some interger repeats in p */ +/* 0 otherwise */ +int PQCLEAN_MCELIECE348864F_AVX_perm_check(const uint32_t *p) { + int 
i; + uint64_t list[1 << GFBITS]; + + for (i = 0; i < (1 << GFBITS); i++) { + list[i] = p[i]; + } + + PQCLEAN_MCELIECE348864F_AVX_sort_63b(1 << GFBITS, list); + + for (i = 1; i < (1 << GFBITS); i++) { + if (list[i - 1] == list[i]) { + return -1; + } + } + + return 0; +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/sk_gen.h b/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/sk_gen.h new file mode 100644 index 0000000000..fd47ebb45b --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/sk_gen.h @@ -0,0 +1,16 @@ +#ifndef PQCLEAN_MCELIECE348864F_AVX_SK_GEN_H +#define PQCLEAN_MCELIECE348864F_AVX_SK_GEN_H +/* + This file is for secret-key generation +*/ + + +#include "gf.h" + +#include + +int PQCLEAN_MCELIECE348864F_AVX_genpoly_gen(gf * /*out*/, gf * /*f*/); +int PQCLEAN_MCELIECE348864F_AVX_perm_check(const uint32_t * /*p*/); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/syndrome_asm.S b/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/syndrome_asm.S new file mode 100644 index 0000000000..06cc0f2e59 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/syndrome_asm.S @@ -0,0 +1,530 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: int64 b64 + +# qhasm: int64 synd + +# qhasm: int64 addr + +# qhasm: int64 c + +# qhasm: int64 c_all + +# qhasm: int64 row + +# qhasm: int64 p + +# qhasm: int64 e + +# qhasm: int64 s + +# qhasm: reg256 pp + +# qhasm: reg256 ee + +# qhasm: reg256 ss + +# qhasm: int64 buf_ptr + +# qhasm: stack256 buf + +# qhasm: enter syndrome_asm +.p2align 5 +.global _PQCLEAN_MCELIECE348864F_AVX_syndrome_asm +.global PQCLEAN_MCELIECE348864F_AVX_syndrome_asm +_PQCLEAN_MCELIECE348864F_AVX_syndrome_asm: +PQCLEAN_MCELIECE348864F_AVX_syndrome_asm: +mov %rsp,%r11 +and $31,%r11 +add $32,%r11 +sub %r11,%rsp + +# qhasm: input_1 += 260780 +# asm 1: add $260780,buf_ptr=int64#4 +# asm 2: leaq buf_ptr=%rcx +leaq 0(%rsp),%rcx + +# qhasm: row = 768 +# asm 1: mov $768,>row=int64#5 +# asm 2: mov $768,>row=%r8 +mov $768,%r8 + +# qhasm: loop: +._loop: + +# qhasm: row -= 1 +# asm 1: sub $1,ss=reg256#1 +# asm 2: vmovupd 0(ss=%ymm0 +vmovupd 0(%rsi),%ymm0 + +# qhasm: ee = mem256[ input_2 + 96 ] +# asm 1: vmovupd 96(ee=reg256#2 +# asm 2: vmovupd 96(ee=%ymm1 +vmovupd 96(%rdx),%ymm1 + +# qhasm: ss &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 32(pp=%ymm1 +vmovupd 32(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 128 ] +# asm 1: vmovupd 128(ee=reg256#3 +# asm 2: vmovupd 128(ee=%ymm2 +vmovupd 128(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 64(pp=%ymm1 +vmovupd 64(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 160 ] +# asm 1: vmovupd 160(ee=reg256#3 +# asm 2: vmovupd 160(ee=%ymm2 +vmovupd 160(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 96(pp=%ymm1 +vmovupd 96(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 192 ] +# asm 1: vmovupd 192(ee=reg256#3 +# asm 2: vmovupd 192(ee=%ymm2 +vmovupd 192(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 128(pp=%ymm1 +vmovupd 128(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 224 ] +# asm 1: vmovupd 
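# ----------------------------------------------------------------------------
# Informal overview (a reader's gloss inferred from the surrounding code and
# the mceliece348864 parameters; not authoritative qhasm output): this routine
# appears to compute the syndrome s = H*e over GF(2), with H = [I | T] in
# systematic form.  For each of the 768 rows of T (340 bytes per row, hence
# the initial pointer offset 260780 = 767 * 340) the row is ANDed with the
# tail of the error vector e, which starts at bit 768 (byte offset 96 of
# input_2).  The 256-bit partial products are spilled to the stack buffer,
# each 64-bit piece is POPCNTed, the counts are XORed together, and the low
# bit (the row parity) is shifted into the syndrome byte addressed by row/8.
# After the loop, the first 768 bits of e are XORed into s to account for the
# identity block of H.
# ----------------------------------------------------------------------------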
224(ee=reg256#3 +# asm 2: vmovupd 224(ee=%ymm2 +vmovupd 224(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 160(pp=%ymm1 +vmovupd 160(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 256 ] +# asm 1: vmovupd 256(ee=reg256#3 +# asm 2: vmovupd 256(ee=%ymm2 +vmovupd 256(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 192(pp=%ymm1 +vmovupd 192(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 288 ] +# asm 1: vmovupd 288(ee=reg256#3 +# asm 2: vmovupd 288(ee=%ymm2 +vmovupd 288(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 224(pp=%ymm1 +vmovupd 224(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 320 ] +# asm 1: vmovupd 320(ee=reg256#3 +# asm 2: vmovupd 320(ee=%ymm2 +vmovupd 320(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 256(pp=%ymm1 +vmovupd 256(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 352 ] +# asm 1: vmovupd 352(ee=reg256#3 +# asm 2: vmovupd 352(ee=%ymm2 +vmovupd 352(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 288(pp=%ymm1 +vmovupd 288(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 384 ] +# asm 1: vmovupd 384(ee=reg256#3 +# asm 2: vmovupd 384(ee=%ymm2 +vmovupd 384(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand buf=stack256#1 +# asm 2: vmovapd buf=0(%rsp) +vmovapd %ymm0,0(%rsp) + +# qhasm: s = *(uint64 *)(input_1 + 320) +# asm 1: movq 320(s=int64#6 +# asm 2: movq 320(s=%r9 +movq 320(%rsi),%r9 + +# qhasm: e = *(uint64 *)(input_2 + 416) +# asm 1: movq 416(e=int64#7 +# asm 2: movq 416(e=%rax +movq 416(%rdx),%rax + +# qhasm: s &= e +# asm 1: and p=int64#7 +# asm 2: movq 328(p=%rax +movq 328(%rsi),%rax + +# qhasm: e = *(uint64 *)(input_2 + 424) +# asm 1: movq 424(e=int64#8 +# asm 2: movq 424(e=%r10 +movq 424(%rdx),%r10 + +# qhasm: p &= e +# asm 1: and p=int64#7d +# asm 2: movl 336(p=%eax +movl 336(%rsi),%eax + +# qhasm: e = *(uint32 *)(input_2 + 432) +# asm 1: movl 432(e=int64#8d +# asm 2: movl 432(e=%r10d +movl 432(%rdx),%r10d + +# qhasm: p &= e +# asm 1: and c_all=int64#6 +# asm 2: popcnt c_all=%r9 +popcnt %r9, %r9 + +# qhasm: b64 = mem64[ buf_ptr + 0 ] +# asm 1: movq 0(b64=int64#7 +# asm 2: movq 0(b64=%rax +movq 0(%rcx),%rax + +# qhasm: c = count(b64) +# asm 1: popcnt c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# qhasm: c_all ^= c +# asm 1: xor b64=int64#7 +# asm 2: movq 8(b64=%rax +movq 8(%rcx),%rax + +# qhasm: c = count(b64) +# asm 1: popcnt c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# qhasm: c_all ^= c +# asm 1: xor b64=int64#7 +# asm 2: movq 16(b64=%rax +movq 16(%rcx),%rax + +# qhasm: c = count(b64) +# asm 1: popcnt c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# qhasm: c_all ^= c +# asm 1: xor b64=int64#7 +# asm 2: movq 24(b64=%rax +movq 24(%rcx),%rax + +# qhasm: c = count(b64) +# asm 1: popcnt c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# qhasm: c_all ^= c +# asm 1: xor addr=int64#7 +# asm 2: mov addr=%rax +mov %r8,%rax + +# qhasm: (uint64) addr >>= 3 +# asm 1: shr $3,synd=int64#8 +# asm 2: movzbq 0(synd=%r10 +movzbq 0(%rax),%r10 + +# qhasm: synd <<= 1 +# asm 1: shl $1,ss=reg256#1 +# asm 2: vmovupd 0(ss=%ymm0 +vmovupd 0(%rdi),%ymm0 + +# qhasm: ee = mem256[ input_2 + 0 ] +# asm 1: vmovupd 0(ee=reg256#2 +# asm 2: vmovupd 0(ee=%ymm1 +vmovupd 0(%rdx),%ymm1 + +# qhasm: ss ^= ee +# asm 1: vpxor ss=reg256#1 +# asm 2: vmovupd 32(ss=%ymm0 +vmovupd 32(%rdi),%ymm0 + +# qhasm: ee = mem256[ input_2 + 32 ] +# asm 1: vmovupd 32(ee=reg256#2 +# asm 2: vmovupd 32(ee=%ymm1 +vmovupd 32(%rdx),%ymm1 + +# 
qhasm: ss ^= ee +# asm 1: vpxor ss=reg256#1 +# asm 2: vmovupd 64(ss=%ymm0 +vmovupd 64(%rdi),%ymm0 + +# qhasm: ee = mem256[ input_2 + 64 ] +# asm 1: vmovupd 64(ee=reg256#2 +# asm 2: vmovupd 64(ee=%ymm1 +vmovupd 64(%rdx),%ymm1 + +# qhasm: ss ^= ee +# asm 1: vpxor + + +void PQCLEAN_MCELIECE348864F_AVX_transpose_64x64(uint64_t *in); +void PQCLEAN_MCELIECE348864F_AVX_transpose_64x256_sp(vec256 *in); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/transpose_64x256_sp_asm.S b/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/transpose_64x256_sp_asm.S new file mode 100644 index 0000000000..17c4412b78 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/transpose_64x256_sp_asm.S @@ -0,0 +1,8145 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: reg256 x0 + +# qhasm: reg256 x1 + +# qhasm: reg256 x2 + +# qhasm: reg256 x3 + +# qhasm: reg256 x4 + +# qhasm: reg256 x5 + +# qhasm: reg256 x6 + +# qhasm: reg256 x7 + +# qhasm: reg256 t0 + +# qhasm: reg256 t1 + +# qhasm: reg256 v00 + +# qhasm: reg256 v01 + +# qhasm: reg256 v10 + +# qhasm: reg256 v11 + +# qhasm: reg256 mask0 + +# qhasm: reg256 mask1 + +# qhasm: reg256 mask2 + +# qhasm: reg256 mask3 + +# qhasm: reg256 mask4 + +# qhasm: reg256 mask5 + +# qhasm: enter transpose_64x256_sp_asm +.p2align 5 +.global _PQCLEAN_MCELIECE348864F_AVX_transpose_64x256_sp_asm +.global PQCLEAN_MCELIECE348864F_AVX_transpose_64x256_sp_asm +_PQCLEAN_MCELIECE348864F_AVX_transpose_64x256_sp_asm: +PQCLEAN_MCELIECE348864F_AVX_transpose_64x256_sp_asm: +mov %rsp,%r11 +and $31,%r11 +add $0,%r11 +sub %r11,%rsp + +# qhasm: mask0 aligned= mem256[ PQCLEAN_MCELIECE348864F_AVX_MASK5_0 ] +# asm 1: vmovapd PQCLEAN_MCELIECE348864F_AVX_MASK5_0,>mask0=reg256#1 +# asm 2: vmovapd PQCLEAN_MCELIECE348864F_AVX_MASK5_0,>mask0=%ymm0 +vmovapd PQCLEAN_MCELIECE348864F_AVX_MASK5_0(%rip),%ymm0 + +# qhasm: mask1 aligned= mem256[ PQCLEAN_MCELIECE348864F_AVX_MASK5_1 ] +# asm 1: vmovapd PQCLEAN_MCELIECE348864F_AVX_MASK5_1,>mask1=reg256#2 +# asm 2: vmovapd PQCLEAN_MCELIECE348864F_AVX_MASK5_1,>mask1=%ymm1 +vmovapd PQCLEAN_MCELIECE348864F_AVX_MASK5_1(%rip),%ymm1 + +# qhasm: mask2 aligned= mem256[ PQCLEAN_MCELIECE348864F_AVX_MASK4_0 ] +# asm 1: vmovapd PQCLEAN_MCELIECE348864F_AVX_MASK4_0,>mask2=reg256#3 +# asm 2: vmovapd PQCLEAN_MCELIECE348864F_AVX_MASK4_0,>mask2=%ymm2 +vmovapd PQCLEAN_MCELIECE348864F_AVX_MASK4_0(%rip),%ymm2 + +# qhasm: mask3 aligned= mem256[ PQCLEAN_MCELIECE348864F_AVX_MASK4_1 ] +# asm 1: vmovapd PQCLEAN_MCELIECE348864F_AVX_MASK4_1,>mask3=reg256#4 +# asm 2: vmovapd PQCLEAN_MCELIECE348864F_AVX_MASK4_1,>mask3=%ymm3 +vmovapd PQCLEAN_MCELIECE348864F_AVX_MASK4_1(%rip),%ymm3 + +# qhasm: mask4 aligned= mem256[ PQCLEAN_MCELIECE348864F_AVX_MASK3_0 ] +# asm 1: vmovapd PQCLEAN_MCELIECE348864F_AVX_MASK3_0,>mask4=reg256#5 +# asm 2: vmovapd PQCLEAN_MCELIECE348864F_AVX_MASK3_0,>mask4=%ymm4 +vmovapd PQCLEAN_MCELIECE348864F_AVX_MASK3_0(%rip),%ymm4 + +# qhasm: mask5 aligned= mem256[ PQCLEAN_MCELIECE348864F_AVX_MASK3_1 ] +# asm 1: vmovapd PQCLEAN_MCELIECE348864F_AVX_MASK3_1,>mask5=reg256#6 +# asm 2: vmovapd PQCLEAN_MCELIECE348864F_AVX_MASK3_1,>mask5=%ymm5 +vmovapd 
PQCLEAN_MCELIECE348864F_AVX_MASK3_1(%rip),%ymm5 + +# qhasm: x0 = mem256[ input_0 + 0 ] +# asm 1: vmovupd 0(x0=reg256#7 +# asm 2: vmovupd 0(x0=%ymm6 +vmovupd 0(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 256 ] +# asm 1: vmovupd 256(x1=reg256#8 +# asm 2: vmovupd 256(x1=%ymm7 +vmovupd 256(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 512 ] +# asm 1: vmovupd 512(x2=reg256#9 +# asm 2: vmovupd 512(x2=%ymm8 +vmovupd 512(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 768 ] +# asm 1: vmovupd 768(x3=reg256#10 +# asm 2: vmovupd 768(x3=%ymm9 +vmovupd 768(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1024 ] +# asm 1: vmovupd 1024(x4=reg256#11 +# asm 2: vmovupd 1024(x4=%ymm10 +vmovupd 1024(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1280 ] +# asm 1: vmovupd 1280(x5=reg256#12 +# asm 2: vmovupd 1280(x5=%ymm11 +vmovupd 1280(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1536 ] +# asm 1: vmovupd 1536(x6=reg256#13 +# asm 2: vmovupd 1536(x6=%ymm12 +vmovupd 1536(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1792 ] +# asm 1: vmovupd 1792(x7=reg256#14 +# asm 2: vmovupd 1792(x7=%ymm13 +vmovupd 1792(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = 
x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: 
vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 0 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 32(x0=%ymm6 +vmovupd 32(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 288 ] +# asm 1: vmovupd 288(x1=reg256#8 +# asm 2: vmovupd 288(x1=%ymm7 +vmovupd 288(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 544 ] +# asm 1: vmovupd 544(x2=reg256#9 +# asm 2: vmovupd 544(x2=%ymm8 +vmovupd 544(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 800 ] +# asm 1: vmovupd 800(x3=reg256#10 +# asm 2: vmovupd 800(x3=%ymm9 +vmovupd 800(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1056 ] +# asm 1: vmovupd 1056(x4=reg256#11 +# asm 2: vmovupd 1056(x4=%ymm10 +vmovupd 1056(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1312 ] +# asm 1: vmovupd 1312(x5=reg256#12 +# asm 2: vmovupd 1312(x5=%ymm11 +vmovupd 1312(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1568 ] +# asm 1: vmovupd 1568(x6=reg256#13 +# asm 2: vmovupd 1568(x6=%ymm12 +vmovupd 1568(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1824 ] +# asm 1: vmovupd 1824(x7=reg256#14 +# asm 2: vmovupd 1824(x7=%ymm13 +vmovupd 1824(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 
32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 
& mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 
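# ----------------------------------------------------------------------------
# Informal overview (a reader's gloss, not part of the generated qhasm
# output): this is the classic mask-and-shift bit-matrix transpose butterfly.
# Six constants (MASK5_*, MASK4_*, MASK3_*) select the low/high 32-, 16- and
# 8-bit fields of each 64-bit lane.  For a row pair (a, b) the code forms
# v00 = a & lo-mask, v10 = b shifted so its low field lands in the high
# position, v01 = a shifted down, and v11 = b & hi-mask, then recombines the
# pairs with vpor; the net effect is to swap the two off-diagonal bit blocks
# between a and b.  The butterfly is applied to pairs (x0,x4)..(x3,x7) at
# 32-bit granularity, (x0,x2)..(x5,x7) at 16-bit, and (x0,x1)..(x6,x7) at
# 8-bit, for each group of eight 256-bit rows loaded from input_0.
# ----------------------------------------------------------------------------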
+# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 32 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 64(x0=%ymm6 +vmovupd 64(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 320 ] +# asm 1: vmovupd 320(x1=reg256#8 +# asm 2: vmovupd 320(x1=%ymm7 +vmovupd 320(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 576 ] +# asm 1: vmovupd 576(x2=reg256#9 +# asm 2: vmovupd 576(x2=%ymm8 +vmovupd 576(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 832 ] +# asm 1: vmovupd 832(x3=reg256#10 +# asm 2: vmovupd 832(x3=%ymm9 +vmovupd 832(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1088 ] +# asm 1: vmovupd 1088(x4=reg256#11 +# asm 2: vmovupd 1088(x4=%ymm10 +vmovupd 1088(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1344 ] +# asm 1: vmovupd 1344(x5=reg256#12 +# asm 2: vmovupd 1344(x5=%ymm11 +vmovupd 1344(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1600 ] +# asm 1: vmovupd 1600(x6=reg256#13 +# asm 2: vmovupd 1600(x6=%ymm12 +vmovupd 1600(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1856 ] +# asm 1: vmovupd 1856(x7=reg256#14 +# asm 2: vmovupd 1856(x7=%ymm13 +vmovupd 1856(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: 
vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: 
vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 64 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 96(x0=%ymm6 +vmovupd 96(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 352 ] +# asm 1: vmovupd 352(x1=reg256#8 +# asm 2: vmovupd 352(x1=%ymm7 +vmovupd 352(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 608 ] +# asm 1: vmovupd 608(x2=reg256#9 +# asm 2: vmovupd 608(x2=%ymm8 +vmovupd 608(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 864 ] +# asm 1: vmovupd 864(x3=reg256#10 +# asm 2: vmovupd 864(x3=%ymm9 +vmovupd 864(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 
1120 ] +# asm 1: vmovupd 1120(x4=reg256#11 +# asm 2: vmovupd 1120(x4=%ymm10 +vmovupd 1120(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1376 ] +# asm 1: vmovupd 1376(x5=reg256#12 +# asm 2: vmovupd 1376(x5=%ymm11 +vmovupd 1376(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1632 ] +# asm 1: vmovupd 1632(x6=reg256#13 +# asm 2: vmovupd 1632(x6=%ymm12 +vmovupd 1632(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1888 ] +# asm 1: vmovupd 1888(x7=reg256#14 +# asm 2: vmovupd 1888(x7=%ymm13 +vmovupd 1888(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld 
$16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# 
asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 96 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 128(x0=%ymm6 +vmovupd 128(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 384 ] +# asm 1: vmovupd 384(x1=reg256#8 +# asm 2: vmovupd 384(x1=%ymm7 +vmovupd 384(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 640 ] +# asm 1: vmovupd 640(x2=reg256#9 +# asm 2: vmovupd 640(x2=%ymm8 +vmovupd 640(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 896 ] +# asm 1: vmovupd 896(x3=reg256#10 +# asm 2: vmovupd 896(x3=%ymm9 +vmovupd 896(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1152 ] +# asm 1: vmovupd 1152(x4=reg256#11 +# asm 2: vmovupd 1152(x4=%ymm10 +vmovupd 1152(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1408 ] +# asm 1: vmovupd 1408(x5=reg256#12 +# asm 2: vmovupd 1408(x5=%ymm11 +vmovupd 1408(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1664 ] +# asm 1: vmovupd 1664(x6=reg256#13 +# asm 2: vmovupd 1664(x6=%ymm12 +vmovupd 1664(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1920 ] +# asm 1: vmovupd 1920(x7=reg256#14 +# asm 2: vmovupd 1920(x7=%ymm13 +vmovupd 1920(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# 
asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld 
$16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 
+vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 128 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 160(x0=%ymm6 +vmovupd 160(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 416 ] +# asm 1: vmovupd 416(x1=reg256#8 +# asm 2: vmovupd 416(x1=%ymm7 +vmovupd 416(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 672 ] +# asm 1: vmovupd 672(x2=reg256#9 +# asm 2: vmovupd 672(x2=%ymm8 +vmovupd 672(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 928 ] +# asm 1: vmovupd 928(x3=reg256#10 +# asm 2: vmovupd 928(x3=%ymm9 +vmovupd 928(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1184 ] +# asm 1: vmovupd 1184(x4=reg256#11 +# asm 2: vmovupd 1184(x4=%ymm10 +vmovupd 1184(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1440 ] +# asm 1: vmovupd 1440(x5=reg256#12 +# asm 2: vmovupd 1440(x5=%ymm11 +vmovupd 1440(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1696 ] +# asm 1: vmovupd 1696(x6=reg256#13 +# asm 2: vmovupd 1696(x6=%ymm12 +vmovupd 1696(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1952 ] +# asm 1: vmovupd 1952(x7=reg256#14 +# asm 2: vmovupd 1952(x7=%ymm13 +vmovupd 1952(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: 
vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: 
vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 160 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 192(x0=%ymm6 +vmovupd 192(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 448 ] +# asm 1: vmovupd 448(x1=reg256#8 +# asm 2: vmovupd 448(x1=%ymm7 +vmovupd 448(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 704 ] +# asm 1: vmovupd 704(x2=reg256#9 +# asm 2: vmovupd 704(x2=%ymm8 +vmovupd 704(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 960 ] +# asm 1: vmovupd 960(x3=reg256#10 +# asm 2: vmovupd 960(x3=%ymm9 +vmovupd 960(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1216 ] +# asm 1: vmovupd 1216(x4=reg256#11 +# asm 2: vmovupd 1216(x4=%ymm10 +vmovupd 1216(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1472 ] +# asm 1: vmovupd 1472(x5=reg256#12 +# asm 2: vmovupd 1472(x5=%ymm11 +vmovupd 1472(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1728 ] +# asm 1: vmovupd 1728(x6=reg256#13 +# asm 2: vmovupd 1728(x6=%ymm12 +vmovupd 1728(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1984 ] +# asm 1: vmovupd 1984(x7=reg256#14 +# asm 2: vmovupd 1984(x7=%ymm13 +vmovupd 1984(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: 
vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: 
vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw 
$8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 192 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 224(x0=%ymm6 +vmovupd 224(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 480 ] +# asm 1: vmovupd 480(x1=reg256#8 +# asm 2: vmovupd 480(x1=%ymm7 +vmovupd 480(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 736 ] +# asm 1: vmovupd 736(x2=reg256#9 +# asm 2: vmovupd 736(x2=%ymm8 +vmovupd 736(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 992 ] +# asm 1: vmovupd 992(x3=reg256#10 +# asm 2: vmovupd 992(x3=%ymm9 +vmovupd 992(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1248 ] +# asm 1: vmovupd 1248(x4=reg256#11 +# asm 2: vmovupd 1248(x4=%ymm10 +vmovupd 1248(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1504 ] +# asm 1: vmovupd 1504(x5=reg256#12 +# asm 2: vmovupd 1504(x5=%ymm11 +vmovupd 1504(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1760 ] +# asm 1: vmovupd 1760(x6=reg256#13 +# asm 2: vmovupd 1760(x6=%ymm12 +vmovupd 1760(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 2016 ] +# asm 1: vmovupd 2016(x7=reg256#14 +# asm 2: vmovupd 2016(x7=%ymm13 +vmovupd 2016(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand 
v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#1 +# asm 2: vpand v00=%ymm0 +vpand %ymm9,%ymm0,%ymm0 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#13 +# asm 2: vpsllq $32,v10=%ymm12 +vpsllq $32,%ymm13,%ymm12 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#2 +# asm 2: vpand v11=%ymm1 +vpand %ymm13,%ymm1,%ymm1 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#1 +# asm 2: vpor x3=%ymm0 +vpor %ymm0,%ymm12,%ymm0 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#2 +# asm 2: vpor x7=%ymm1 +vpor %ymm9,%ymm1,%ymm1 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm14,%ymm2,%ymm9 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#13 +# asm 2: vpslld $16,v10=%ymm12 +vpslld $16,%ymm11,%ymm12 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#14 +# asm 2: vpsrld $16,v01=%ymm13 +vpsrld $16,%ymm14,%ymm13 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm12,%ymm9 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm13,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm10,%ymm2,%ymm12 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#14 +# asm 2: vpslld $16,v10=%ymm13 +vpslld $16,%ymm0,%ymm13 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#1 +# asm 2: vpand v11=%ymm0 +vpand %ymm0,%ymm3,%ymm0 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#13 +# asm 2: vpor x1=%ymm12 +vpor %ymm12,%ymm13,%ymm12 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#1 +# asm 2: vpor x3=%ymm0 +vpor %ymm10,%ymm0,%ymm0 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm6,%ymm2,%ymm10 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#14 +# asm 2: vpslld $16,v10=%ymm13 +vpslld $16,%ymm8,%ymm13 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#11 +# asm 2: vpor x4=%ymm10 +vpor %ymm10,%ymm13,%ymm10 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#3 +# asm 2: vpand v00=%ymm2 +vpand %ymm7,%ymm2,%ymm2 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#9 +# asm 2: vpslld $16,v10=%ymm8 +vpslld 
$16,%ymm1,%ymm8 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#2 +# asm 2: vpand v11=%ymm1 +vpand %ymm1,%ymm3,%ymm1 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#3 +# asm 2: vpor x5=%ymm2 +vpor %ymm2,%ymm8,%ymm2 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#2 +# asm 2: vpor x7=%ymm1 +vpor %ymm7,%ymm1,%ymm1 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#4 +# asm 2: vpand v00=%ymm3 +vpand %ymm9,%ymm4,%ymm3 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#8 +# asm 2: vpsllw $8,v10=%ymm7 +vpsllw $8,%ymm12,%ymm7 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#9 +# asm 2: vpsrlw $8,v01=%ymm8 +vpsrlw $8,%ymm9,%ymm8 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm12,%ymm5,%ymm9 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#4 +# asm 2: vpor x0=%ymm3 +vpor %ymm3,%ymm7,%ymm3 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#8 +# asm 2: vpor x1=%ymm7 +vpor %ymm8,%ymm9,%ymm7 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm11,%ymm4,%ymm8 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#10 +# asm 2: vpsllw $8,v10=%ymm9 +vpsllw $8,%ymm0,%ymm9 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#1 +# asm 2: vpand v11=%ymm0 +vpand %ymm0,%ymm5,%ymm0 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#9 +# asm 2: vpor x2=%ymm8 +vpor %ymm8,%ymm9,%ymm8 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#1 +# asm 2: vpor x3=%ymm0 +vpor %ymm11,%ymm0,%ymm0 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm10,%ymm4,%ymm9 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#12 +# asm 2: vpsllw $8,v10=%ymm11 +vpsllw $8,%ymm2,%ymm11 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#11 +# asm 2: vpsrlw $8,v01=%ymm10 +vpsrlw $8,%ymm10,%ymm10 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#3 +# asm 2: vpand v11=%ymm2 +vpand %ymm2,%ymm5,%ymm2 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#10 +# asm 2: vpor x4=%ymm9 +vpor %ymm9,%ymm11,%ymm9 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#3 +# asm 2: vpor x5=%ymm2 +vpor %ymm10,%ymm2,%ymm2 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#5 +# asm 2: vpand v00=%ymm4 +vpand %ymm6,%ymm4,%ymm4 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#11 +# asm 2: vpsllw $8,v10=%ymm10 +vpsllw $8,%ymm1,%ymm10 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#2 +# asm 2: vpand v11=%ymm1 +vpand %ymm1,%ymm5,%ymm1 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#5 +# asm 2: vpor x6=%ymm4 +vpor %ymm4,%ymm10,%ymm4 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#2 +# asm 2: vpor x7=%ymm1 +vpor %ymm6,%ymm1,%ymm1 + +# qhasm: mem256[ input_0 + 224 ] = x0 +# asm 1: vmovupd mask0=reg256#1 +# asm 2: vmovapd PQCLEAN_MCELIECE348864F_AVX_MASK2_0,>mask0=%ymm0 +vmovapd PQCLEAN_MCELIECE348864F_AVX_MASK2_0(%rip),%ymm0 + +# qhasm: mask1 aligned= mem256[ PQCLEAN_MCELIECE348864F_AVX_MASK2_1 ] +# asm 1: vmovapd PQCLEAN_MCELIECE348864F_AVX_MASK2_1,>mask1=reg256#2 +# asm 2: vmovapd PQCLEAN_MCELIECE348864F_AVX_MASK2_1,>mask1=%ymm1 +vmovapd 
PQCLEAN_MCELIECE348864F_AVX_MASK2_1(%rip),%ymm1 + +# qhasm: mask2 aligned= mem256[ PQCLEAN_MCELIECE348864F_AVX_MASK1_0 ] +# asm 1: vmovapd PQCLEAN_MCELIECE348864F_AVX_MASK1_0,>mask2=reg256#3 +# asm 2: vmovapd PQCLEAN_MCELIECE348864F_AVX_MASK1_0,>mask2=%ymm2 +vmovapd PQCLEAN_MCELIECE348864F_AVX_MASK1_0(%rip),%ymm2 + +# qhasm: mask3 aligned= mem256[ PQCLEAN_MCELIECE348864F_AVX_MASK1_1 ] +# asm 1: vmovapd PQCLEAN_MCELIECE348864F_AVX_MASK1_1,>mask3=reg256#4 +# asm 2: vmovapd PQCLEAN_MCELIECE348864F_AVX_MASK1_1,>mask3=%ymm3 +vmovapd PQCLEAN_MCELIECE348864F_AVX_MASK1_1(%rip),%ymm3 + +# qhasm: mask4 aligned= mem256[ PQCLEAN_MCELIECE348864F_AVX_MASK0_0 ] +# asm 1: vmovapd PQCLEAN_MCELIECE348864F_AVX_MASK0_0,>mask4=reg256#5 +# asm 2: vmovapd PQCLEAN_MCELIECE348864F_AVX_MASK0_0,>mask4=%ymm4 +vmovapd PQCLEAN_MCELIECE348864F_AVX_MASK0_0(%rip),%ymm4 + +# qhasm: mask5 aligned= mem256[ PQCLEAN_MCELIECE348864F_AVX_MASK0_1 ] +# asm 1: vmovapd PQCLEAN_MCELIECE348864F_AVX_MASK0_1,>mask5=reg256#6 +# asm 2: vmovapd PQCLEAN_MCELIECE348864F_AVX_MASK0_1,>mask5=%ymm5 +vmovapd PQCLEAN_MCELIECE348864F_AVX_MASK0_1(%rip),%ymm5 + +# qhasm: x0 = mem256[ input_0 + 0 ] +# asm 1: vmovupd 0(x0=reg256#7 +# asm 2: vmovupd 0(x0=%ymm6 +vmovupd 0(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 32 ] +# asm 1: vmovupd 32(x1=reg256#8 +# asm 2: vmovupd 32(x1=%ymm7 +vmovupd 32(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 64 ] +# asm 1: vmovupd 64(x2=reg256#9 +# asm 2: vmovupd 64(x2=%ymm8 +vmovupd 64(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 96 ] +# asm 1: vmovupd 96(x3=reg256#10 +# asm 2: vmovupd 96(x3=%ymm9 +vmovupd 96(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 128 ] +# asm 1: vmovupd 128(x4=reg256#11 +# asm 2: vmovupd 128(x4=%ymm10 +vmovupd 128(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 160 ] +# asm 1: vmovupd 160(x5=reg256#12 +# asm 2: vmovupd 160(x5=%ymm11 +vmovupd 160(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 192 ] +# asm 1: vmovupd 192(x6=reg256#13 +# asm 2: vmovupd 192(x6=%ymm12 +vmovupd 192(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 224 ] +# asm 1: vmovupd 224(x7=reg256#14 +# asm 2: vmovupd 224(x7=%ymm13 +vmovupd 224(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# 
asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq 
$2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 0 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 256(x0=%ymm6 +vmovupd 256(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 288 ] +# asm 1: vmovupd 288(x1=reg256#8 +# asm 2: vmovupd 288(x1=%ymm7 +vmovupd 288(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 320 ] +# asm 1: vmovupd 320(x2=reg256#9 +# asm 2: vmovupd 320(x2=%ymm8 +vmovupd 320(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 352 ] +# asm 1: vmovupd 352(x3=reg256#10 +# asm 2: vmovupd 
352(x3=%ymm9 +vmovupd 352(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 384 ] +# asm 1: vmovupd 384(x4=reg256#11 +# asm 2: vmovupd 384(x4=%ymm10 +vmovupd 384(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 416 ] +# asm 1: vmovupd 416(x5=reg256#12 +# asm 2: vmovupd 416(x5=%ymm11 +vmovupd 416(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 448 ] +# asm 1: vmovupd 448(x6=reg256#13 +# asm 2: vmovupd 448(x6=%ymm12 +vmovupd 448(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 480 ] +# asm 1: vmovupd 480(x7=reg256#14 +# asm 2: vmovupd 480(x7=%ymm13 +vmovupd 480(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand 
v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq 
$1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 256 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 512(x0=%ymm6 +vmovupd 512(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 544 ] +# asm 1: vmovupd 544(x1=reg256#8 +# asm 2: vmovupd 544(x1=%ymm7 +vmovupd 544(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 576 ] +# asm 1: vmovupd 576(x2=reg256#9 +# asm 2: vmovupd 576(x2=%ymm8 +vmovupd 576(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 608 ] +# asm 1: vmovupd 608(x3=reg256#10 +# asm 2: vmovupd 608(x3=%ymm9 +vmovupd 608(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 640 ] +# asm 1: vmovupd 640(x4=reg256#11 +# asm 2: vmovupd 640(x4=%ymm10 +vmovupd 640(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 672 ] +# asm 1: vmovupd 672(x5=reg256#12 +# asm 2: vmovupd 672(x5=%ymm11 +vmovupd 672(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 704 ] +# asm 1: vmovupd 704(x6=reg256#13 +# asm 2: vmovupd 704(x6=%ymm12 +vmovupd 704(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 736 ] +# asm 1: vmovupd 736(x7=reg256#14 +# asm 2: vmovupd 736(x7=%ymm13 +vmovupd 736(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand 
[auto-generated qhasm AVX assembly for the PQCLEAN_MCELIECE348864F_AVX transpose routines: repeated vpand/vpsllq/vpsrlq/vpor mask-shift-merge steps over the ymm registers (reg256#1–#16), with 256-bit vmovupd loads and stores at successive 32-byte offsets of input_0 (%rdi) up through offset 2016; followed by a 128-bit pass that loads the PQCLEAN_MCELIECE348864F_AVX_MASK5_0/_1, MASK4_0/_1 and MASK3_0/_1 constants with movdqa, gathers 64-bit words with movddup at 64-byte strides, applies the same mask-shift-merge pattern with vpsllq/vpsrlq $32, vpslld/vpsrld $16 and vpsllw/vpsrlw $8, and writes the results back with pextrq/movq]
$8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = r0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = r1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#10 +# asm 2: vpor r0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: r1 = v01 | v11 +# asm 1: vpor r1=reg128#14 +# asm 2: vpor r1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = r2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = r3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = r2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = r3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: r2 = v00 | v10 +# asm 1: vpor r2=reg128#15 +# asm 2: vpor r2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = r4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = r5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = r4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = r5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: r4 = v00 | v10 +# asm 1: vpor r4=reg128#12 +# asm 2: vpor r4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = r6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = r7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = r6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = r7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: r6 = v00 | v10 +# asm 1: vpor r6=reg128#13 +# asm 2: vpor r6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#7 +# asm 2: vpor r7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: buf = r0[0] +# asm 1: pextrq $0x0,buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm9,%rsi + +# qhasm: mem64[ input_0 + 8 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm13,%rsi + +# qhasm: mem64[ input_0 + 72 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm14,%rsi + +# qhasm: mem64[ input_0 + 136 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm10,%rsi + +# qhasm: mem64[ input_0 + 200 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm11,%rsi + +# qhasm: mem64[ input_0 + 264 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm8,%rsi + +# qhasm: mem64[ input_0 + 328 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm12,%rsi + +# qhasm: mem64[ input_0 + 392 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm6,%rsi + +# qhasm: mem64[ 
input_0 + 456 ] = buf +# asm 1: movq r0=reg128#7 +# asm 2: movddup 16(r0=%xmm6 +movddup 16(%rdi),%xmm6 + +# qhasm: r1 = mem64[ input_0 + 80 ] x2 +# asm 1: movddup 80(r1=reg128#8 +# asm 2: movddup 80(r1=%xmm7 +movddup 80(%rdi),%xmm7 + +# qhasm: r2 = mem64[ input_0 + 144 ] x2 +# asm 1: movddup 144(r2=reg128#9 +# asm 2: movddup 144(r2=%xmm8 +movddup 144(%rdi),%xmm8 + +# qhasm: r3 = mem64[ input_0 + 208 ] x2 +# asm 1: movddup 208(r3=reg128#10 +# asm 2: movddup 208(r3=%xmm9 +movddup 208(%rdi),%xmm9 + +# qhasm: r4 = mem64[ input_0 + 272 ] x2 +# asm 1: movddup 272(r4=reg128#11 +# asm 2: movddup 272(r4=%xmm10 +movddup 272(%rdi),%xmm10 + +# qhasm: r5 = mem64[ input_0 + 336 ] x2 +# asm 1: movddup 336(r5=reg128#12 +# asm 2: movddup 336(r5=%xmm11 +movddup 336(%rdi),%xmm11 + +# qhasm: r6 = mem64[ input_0 + 400 ] x2 +# asm 1: movddup 400(r6=reg128#13 +# asm 2: movddup 400(r6=%xmm12 +movddup 400(%rdi),%xmm12 + +# qhasm: r7 = mem64[ input_0 + 464 ] x2 +# asm 1: movddup 464(r7=reg128#14 +# asm 2: movddup 464(r7=%xmm13 +movddup 464(%rdi),%xmm13 + +# qhasm: v00 = r0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = r4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = r0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = r4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#15 +# asm 2: vpor r0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r4 = v01 | v11 +# asm 1: vpor r4=reg128#7 +# asm 2: vpor r4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = r1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = r5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = r1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = r5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: r1 = v00 | v10 +# asm 1: vpor r1=reg128#11 +# asm 2: vpor r1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#8 +# asm 2: vpor r5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = r2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = r6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = r2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = r6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: r2 = v00 | v10 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#9 +# asm 2: vpor r6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = r3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = r7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = r3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = r7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand 
%xmm1,%xmm13,%xmm13 + +# qhasm: r3 = v00 | v10 +# asm 1: vpor r3=reg128#13 +# asm 2: vpor r3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#10 +# asm 2: vpor r7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = r0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = r2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = r0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = r2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#14 +# asm 2: vpor r0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: r2 = v01 | v11 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = r1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = r3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = r1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = r3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: r1 = v00 | v10 +# asm 1: vpor r1=reg128#15 +# asm 2: vpor r1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = r4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = r6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = r4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = r6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: r4 = v00 | v10 +# asm 1: vpor r4=reg128#13 +# asm 2: vpor r4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#7 +# asm 2: vpor r6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = r5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = r7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = r5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = r7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: r5 = v00 | v10 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#8 +# asm 2: vpor r7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = r0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = r1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = r0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = r1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#10 +# asm 2: vpor r0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# 
qhasm: r1 = v01 | v11 +# asm 1: vpor r1=reg128#14 +# asm 2: vpor r1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = r2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = r3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = r2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = r3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: r2 = v00 | v10 +# asm 1: vpor r2=reg128#15 +# asm 2: vpor r2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = r4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = r5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = r4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = r5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: r4 = v00 | v10 +# asm 1: vpor r4=reg128#12 +# asm 2: vpor r4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = r6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = r7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = r6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = r7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: r6 = v00 | v10 +# asm 1: vpor r6=reg128#13 +# asm 2: vpor r6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#7 +# asm 2: vpor r7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: buf = r0[0] +# asm 1: pextrq $0x0,buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm9,%rsi + +# qhasm: mem64[ input_0 + 16 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm13,%rsi + +# qhasm: mem64[ input_0 + 80 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm14,%rsi + +# qhasm: mem64[ input_0 + 144 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm10,%rsi + +# qhasm: mem64[ input_0 + 208 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm11,%rsi + +# qhasm: mem64[ input_0 + 272 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm8,%rsi + +# qhasm: mem64[ input_0 + 336 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm12,%rsi + +# qhasm: mem64[ input_0 + 400 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm6,%rsi + +# qhasm: mem64[ input_0 + 464 ] = buf +# asm 1: movq r0=reg128#7 +# asm 2: movddup 24(r0=%xmm6 +movddup 24(%rdi),%xmm6 + +# qhasm: r1 = mem64[ input_0 + 88 ] x2 +# asm 1: movddup 88(r1=reg128#8 +# asm 2: movddup 88(r1=%xmm7 +movddup 88(%rdi),%xmm7 + +# qhasm: r2 = mem64[ input_0 + 152 ] x2 +# asm 1: movddup 152(r2=reg128#9 +# asm 2: movddup 152(r2=%xmm8 +movddup 152(%rdi),%xmm8 + +# qhasm: r3 = mem64[ input_0 + 216 ] x2 +# asm 1: 
movddup 216(r3=reg128#10 +# asm 2: movddup 216(r3=%xmm9 +movddup 216(%rdi),%xmm9 + +# qhasm: r4 = mem64[ input_0 + 280 ] x2 +# asm 1: movddup 280(r4=reg128#11 +# asm 2: movddup 280(r4=%xmm10 +movddup 280(%rdi),%xmm10 + +# qhasm: r5 = mem64[ input_0 + 344 ] x2 +# asm 1: movddup 344(r5=reg128#12 +# asm 2: movddup 344(r5=%xmm11 +movddup 344(%rdi),%xmm11 + +# qhasm: r6 = mem64[ input_0 + 408 ] x2 +# asm 1: movddup 408(r6=reg128#13 +# asm 2: movddup 408(r6=%xmm12 +movddup 408(%rdi),%xmm12 + +# qhasm: r7 = mem64[ input_0 + 472 ] x2 +# asm 1: movddup 472(r7=reg128#14 +# asm 2: movddup 472(r7=%xmm13 +movddup 472(%rdi),%xmm13 + +# qhasm: v00 = r0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = r4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = r0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = r4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#15 +# asm 2: vpor r0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r4 = v01 | v11 +# asm 1: vpor r4=reg128#7 +# asm 2: vpor r4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = r1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = r5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = r1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = r5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: r1 = v00 | v10 +# asm 1: vpor r1=reg128#11 +# asm 2: vpor r1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#8 +# asm 2: vpor r5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = r2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = r6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = r2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = r6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: r2 = v00 | v10 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#9 +# asm 2: vpor r6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = r3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = r7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = r3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = r7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: r3 = v00 | v10 +# asm 1: vpor r3=reg128#13 +# asm 2: vpor r3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#10 +# asm 2: vpor r7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = r0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = r2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: 
vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = r0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = r2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#14 +# asm 2: vpor r0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: r2 = v01 | v11 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = r1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = r3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = r1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = r3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: r1 = v00 | v10 +# asm 1: vpor r1=reg128#15 +# asm 2: vpor r1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = r4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = r6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = r4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = r6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: r4 = v00 | v10 +# asm 1: vpor r4=reg128#13 +# asm 2: vpor r4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#7 +# asm 2: vpor r6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = r5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = r7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = r5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = r7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: r5 = v00 | v10 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#8 +# asm 2: vpor r7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = r0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = r1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = r0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = r1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#10 +# asm 2: vpor r0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: r1 = v01 | v11 +# asm 1: vpor r1=reg128#14 +# asm 2: vpor r1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = r2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = r3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = r2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw 
$8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = r3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: r2 = v00 | v10 +# asm 1: vpor r2=reg128#15 +# asm 2: vpor r2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = r4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = r5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = r4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = r5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: r4 = v00 | v10 +# asm 1: vpor r4=reg128#12 +# asm 2: vpor r4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = r6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = r7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = r6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = r7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: r6 = v00 | v10 +# asm 1: vpor r6=reg128#13 +# asm 2: vpor r6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#7 +# asm 2: vpor r7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: buf = r0[0] +# asm 1: pextrq $0x0,buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm9,%rsi + +# qhasm: mem64[ input_0 + 24 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm13,%rsi + +# qhasm: mem64[ input_0 + 88 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm14,%rsi + +# qhasm: mem64[ input_0 + 152 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm10,%rsi + +# qhasm: mem64[ input_0 + 216 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm11,%rsi + +# qhasm: mem64[ input_0 + 280 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm8,%rsi + +# qhasm: mem64[ input_0 + 344 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm12,%rsi + +# qhasm: mem64[ input_0 + 408 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm6,%rsi + +# qhasm: mem64[ input_0 + 472 ] = buf +# asm 1: movq r0=reg128#7 +# asm 2: movddup 32(r0=%xmm6 +movddup 32(%rdi),%xmm6 + +# qhasm: r1 = mem64[ input_0 + 96 ] x2 +# asm 1: movddup 96(r1=reg128#8 +# asm 2: movddup 96(r1=%xmm7 +movddup 96(%rdi),%xmm7 + +# qhasm: r2 = mem64[ input_0 + 160 ] x2 +# asm 1: movddup 160(r2=reg128#9 +# asm 2: movddup 160(r2=%xmm8 +movddup 160(%rdi),%xmm8 + +# qhasm: r3 = mem64[ input_0 + 224 ] x2 +# asm 1: movddup 224(r3=reg128#10 +# asm 2: movddup 224(r3=%xmm9 +movddup 224(%rdi),%xmm9 + +# qhasm: r4 = mem64[ input_0 + 288 ] x2 +# asm 1: movddup 288(r4=reg128#11 +# asm 2: movddup 288(r4=%xmm10 +movddup 288(%rdi),%xmm10 + +# qhasm: r5 = mem64[ input_0 + 352 ] x2 +# asm 1: movddup 352(r5=reg128#12 +# asm 2: movddup 352(r5=%xmm11 +movddup 352(%rdi),%xmm11 + +# qhasm: r6 = mem64[ input_0 + 416 ] x2 +# asm 1: movddup 
416(r6=reg128#13 +# asm 2: movddup 416(r6=%xmm12 +movddup 416(%rdi),%xmm12 + +# qhasm: r7 = mem64[ input_0 + 480 ] x2 +# asm 1: movddup 480(r7=reg128#14 +# asm 2: movddup 480(r7=%xmm13 +movddup 480(%rdi),%xmm13 + +# qhasm: v00 = r0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = r4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = r0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = r4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#15 +# asm 2: vpor r0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r4 = v01 | v11 +# asm 1: vpor r4=reg128#7 +# asm 2: vpor r4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = r1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = r5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = r1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = r5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: r1 = v00 | v10 +# asm 1: vpor r1=reg128#11 +# asm 2: vpor r1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#8 +# asm 2: vpor r5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = r2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = r6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = r2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = r6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: r2 = v00 | v10 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#9 +# asm 2: vpor r6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = r3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = r7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = r3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = r7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: r3 = v00 | v10 +# asm 1: vpor r3=reg128#13 +# asm 2: vpor r3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#10 +# asm 2: vpor r7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = r0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = r2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = r0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = r2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#14 +# asm 2: vpor r0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: r2 
= v01 | v11 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = r1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = r3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = r1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = r3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: r1 = v00 | v10 +# asm 1: vpor r1=reg128#15 +# asm 2: vpor r1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = r4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = r6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = r4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = r6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: r4 = v00 | v10 +# asm 1: vpor r4=reg128#13 +# asm 2: vpor r4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#7 +# asm 2: vpor r6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = r5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = r7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = r5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = r7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: r5 = v00 | v10 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#8 +# asm 2: vpor r7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = r0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = r1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = r0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = r1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#10 +# asm 2: vpor r0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: r1 = v01 | v11 +# asm 1: vpor r1=reg128#14 +# asm 2: vpor r1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = r2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = r3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = r2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = r3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: r2 = v00 | v10 +# asm 1: vpor r2=reg128#15 +# asm 2: vpor r2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = r4 & mask4 +# asm 1: vpand 
v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = r5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = r4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = r5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: r4 = v00 | v10 +# asm 1: vpor r4=reg128#12 +# asm 2: vpor r4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = r6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = r7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = r6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = r7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: r6 = v00 | v10 +# asm 1: vpor r6=reg128#13 +# asm 2: vpor r6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#7 +# asm 2: vpor r7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: buf = r0[0] +# asm 1: pextrq $0x0,buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm9,%rsi + +# qhasm: mem64[ input_0 + 32 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm13,%rsi + +# qhasm: mem64[ input_0 + 96 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm14,%rsi + +# qhasm: mem64[ input_0 + 160 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm10,%rsi + +# qhasm: mem64[ input_0 + 224 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm11,%rsi + +# qhasm: mem64[ input_0 + 288 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm8,%rsi + +# qhasm: mem64[ input_0 + 352 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm12,%rsi + +# qhasm: mem64[ input_0 + 416 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm6,%rsi + +# qhasm: mem64[ input_0 + 480 ] = buf +# asm 1: movq r0=reg128#7 +# asm 2: movddup 40(r0=%xmm6 +movddup 40(%rdi),%xmm6 + +# qhasm: r1 = mem64[ input_0 + 104 ] x2 +# asm 1: movddup 104(r1=reg128#8 +# asm 2: movddup 104(r1=%xmm7 +movddup 104(%rdi),%xmm7 + +# qhasm: r2 = mem64[ input_0 + 168 ] x2 +# asm 1: movddup 168(r2=reg128#9 +# asm 2: movddup 168(r2=%xmm8 +movddup 168(%rdi),%xmm8 + +# qhasm: r3 = mem64[ input_0 + 232 ] x2 +# asm 1: movddup 232(r3=reg128#10 +# asm 2: movddup 232(r3=%xmm9 +movddup 232(%rdi),%xmm9 + +# qhasm: r4 = mem64[ input_0 + 296 ] x2 +# asm 1: movddup 296(r4=reg128#11 +# asm 2: movddup 296(r4=%xmm10 +movddup 296(%rdi),%xmm10 + +# qhasm: r5 = mem64[ input_0 + 360 ] x2 +# asm 1: movddup 360(r5=reg128#12 +# asm 2: movddup 360(r5=%xmm11 +movddup 360(%rdi),%xmm11 + +# qhasm: r6 = mem64[ input_0 + 424 ] x2 +# asm 1: movddup 424(r6=reg128#13 +# asm 2: movddup 424(r6=%xmm12 +movddup 424(%rdi),%xmm12 + +# qhasm: r7 = mem64[ input_0 + 488 ] x2 +# asm 1: movddup 488(r7=reg128#14 +# asm 2: movddup 488(r7=%xmm13 +movddup 488(%rdi),%xmm13 + +# qhasm: v00 = r0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = r4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 
+vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = r0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = r4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#15 +# asm 2: vpor r0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r4 = v01 | v11 +# asm 1: vpor r4=reg128#7 +# asm 2: vpor r4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = r1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = r5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = r1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = r5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: r1 = v00 | v10 +# asm 1: vpor r1=reg128#11 +# asm 2: vpor r1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#8 +# asm 2: vpor r5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = r2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = r6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = r2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = r6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: r2 = v00 | v10 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#9 +# asm 2: vpor r6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = r3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = r7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = r3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = r7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: r3 = v00 | v10 +# asm 1: vpor r3=reg128#13 +# asm 2: vpor r3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#10 +# asm 2: vpor r7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = r0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = r2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = r0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = r2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#14 +# asm 2: vpor r0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: r2 = v01 | v11 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = r1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = r3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = r1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld 
$16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = r3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: r1 = v00 | v10 +# asm 1: vpor r1=reg128#15 +# asm 2: vpor r1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = r4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = r6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = r4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = r6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: r4 = v00 | v10 +# asm 1: vpor r4=reg128#13 +# asm 2: vpor r4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#7 +# asm 2: vpor r6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = r5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = r7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = r5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = r7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: r5 = v00 | v10 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#8 +# asm 2: vpor r7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = r0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = r1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = r0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = r1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#10 +# asm 2: vpor r0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: r1 = v01 | v11 +# asm 1: vpor r1=reg128#14 +# asm 2: vpor r1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = r2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = r3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = r2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = r3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: r2 = v00 | v10 +# asm 1: vpor r2=reg128#15 +# asm 2: vpor r2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = r4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = r5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = r4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = r5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 
+ +# qhasm: r4 = v00 | v10 +# asm 1: vpor r4=reg128#12 +# asm 2: vpor r4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = r6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = r7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = r6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = r7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: r6 = v00 | v10 +# asm 1: vpor r6=reg128#13 +# asm 2: vpor r6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#7 +# asm 2: vpor r7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: buf = r0[0] +# asm 1: pextrq $0x0,buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm9,%rsi + +# qhasm: mem64[ input_0 + 40 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm13,%rsi + +# qhasm: mem64[ input_0 + 104 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm14,%rsi + +# qhasm: mem64[ input_0 + 168 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm10,%rsi + +# qhasm: mem64[ input_0 + 232 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm11,%rsi + +# qhasm: mem64[ input_0 + 296 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm8,%rsi + +# qhasm: mem64[ input_0 + 360 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm12,%rsi + +# qhasm: mem64[ input_0 + 424 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm6,%rsi + +# qhasm: mem64[ input_0 + 488 ] = buf +# asm 1: movq r0=reg128#7 +# asm 2: movddup 48(r0=%xmm6 +movddup 48(%rdi),%xmm6 + +# qhasm: r1 = mem64[ input_0 + 112 ] x2 +# asm 1: movddup 112(r1=reg128#8 +# asm 2: movddup 112(r1=%xmm7 +movddup 112(%rdi),%xmm7 + +# qhasm: r2 = mem64[ input_0 + 176 ] x2 +# asm 1: movddup 176(r2=reg128#9 +# asm 2: movddup 176(r2=%xmm8 +movddup 176(%rdi),%xmm8 + +# qhasm: r3 = mem64[ input_0 + 240 ] x2 +# asm 1: movddup 240(r3=reg128#10 +# asm 2: movddup 240(r3=%xmm9 +movddup 240(%rdi),%xmm9 + +# qhasm: r4 = mem64[ input_0 + 304 ] x2 +# asm 1: movddup 304(r4=reg128#11 +# asm 2: movddup 304(r4=%xmm10 +movddup 304(%rdi),%xmm10 + +# qhasm: r5 = mem64[ input_0 + 368 ] x2 +# asm 1: movddup 368(r5=reg128#12 +# asm 2: movddup 368(r5=%xmm11 +movddup 368(%rdi),%xmm11 + +# qhasm: r6 = mem64[ input_0 + 432 ] x2 +# asm 1: movddup 432(r6=reg128#13 +# asm 2: movddup 432(r6=%xmm12 +movddup 432(%rdi),%xmm12 + +# qhasm: r7 = mem64[ input_0 + 496 ] x2 +# asm 1: movddup 496(r7=reg128#14 +# asm 2: movddup 496(r7=%xmm13 +movddup 496(%rdi),%xmm13 + +# qhasm: v00 = r0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = r4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = r0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = r4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#15 +# asm 2: vpor r0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r4 = v01 | v11 +# asm 1: vpor 
r4=reg128#7 +# asm 2: vpor r4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = r1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = r5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = r1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = r5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: r1 = v00 | v10 +# asm 1: vpor r1=reg128#11 +# asm 2: vpor r1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#8 +# asm 2: vpor r5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = r2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = r6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = r2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = r6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: r2 = v00 | v10 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#9 +# asm 2: vpor r6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = r3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = r7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = r3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = r7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: r3 = v00 | v10 +# asm 1: vpor r3=reg128#13 +# asm 2: vpor r3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#10 +# asm 2: vpor r7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = r0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = r2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = r0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = r2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#14 +# asm 2: vpor r0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: r2 = v01 | v11 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = r1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = r3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = r1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = r3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: r1 = v00 | v10 +# asm 1: vpor r1=reg128#15 +# asm 2: vpor r1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = r4 & mask2 +# asm 1: vpand 
v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = r6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = r4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = r6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: r4 = v00 | v10 +# asm 1: vpor r4=reg128#13 +# asm 2: vpor r4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#7 +# asm 2: vpor r6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = r5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = r7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = r5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = r7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: r5 = v00 | v10 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#8 +# asm 2: vpor r7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = r0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = r1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = r0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = r1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#10 +# asm 2: vpor r0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: r1 = v01 | v11 +# asm 1: vpor r1=reg128#14 +# asm 2: vpor r1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = r2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = r3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = r2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = r3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: r2 = v00 | v10 +# asm 1: vpor r2=reg128#15 +# asm 2: vpor r2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = r4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = r5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = r4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = r5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: r4 = v00 | v10 +# asm 1: vpor r4=reg128#12 +# asm 2: vpor r4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = r6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = r7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw 
$8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = r6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = r7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: r6 = v00 | v10 +# asm 1: vpor r6=reg128#13 +# asm 2: vpor r6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#7 +# asm 2: vpor r7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: buf = r0[0] +# asm 1: pextrq $0x0,buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm9,%rsi + +# qhasm: mem64[ input_0 + 48 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm13,%rsi + +# qhasm: mem64[ input_0 + 112 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm14,%rsi + +# qhasm: mem64[ input_0 + 176 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm10,%rsi + +# qhasm: mem64[ input_0 + 240 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm11,%rsi + +# qhasm: mem64[ input_0 + 304 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm8,%rsi + +# qhasm: mem64[ input_0 + 368 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm12,%rsi + +# qhasm: mem64[ input_0 + 432 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm6,%rsi + +# qhasm: mem64[ input_0 + 496 ] = buf +# asm 1: movq r0=reg128#7 +# asm 2: movddup 56(r0=%xmm6 +movddup 56(%rdi),%xmm6 + +# qhasm: r1 = mem64[ input_0 + 120 ] x2 +# asm 1: movddup 120(r1=reg128#8 +# asm 2: movddup 120(r1=%xmm7 +movddup 120(%rdi),%xmm7 + +# qhasm: r2 = mem64[ input_0 + 184 ] x2 +# asm 1: movddup 184(r2=reg128#9 +# asm 2: movddup 184(r2=%xmm8 +movddup 184(%rdi),%xmm8 + +# qhasm: r3 = mem64[ input_0 + 248 ] x2 +# asm 1: movddup 248(r3=reg128#10 +# asm 2: movddup 248(r3=%xmm9 +movddup 248(%rdi),%xmm9 + +# qhasm: r4 = mem64[ input_0 + 312 ] x2 +# asm 1: movddup 312(r4=reg128#11 +# asm 2: movddup 312(r4=%xmm10 +movddup 312(%rdi),%xmm10 + +# qhasm: r5 = mem64[ input_0 + 376 ] x2 +# asm 1: movddup 376(r5=reg128#12 +# asm 2: movddup 376(r5=%xmm11 +movddup 376(%rdi),%xmm11 + +# qhasm: r6 = mem64[ input_0 + 440 ] x2 +# asm 1: movddup 440(r6=reg128#13 +# asm 2: movddup 440(r6=%xmm12 +movddup 440(%rdi),%xmm12 + +# qhasm: r7 = mem64[ input_0 + 504 ] x2 +# asm 1: movddup 504(r7=reg128#14 +# asm 2: movddup 504(r7=%xmm13 +movddup 504(%rdi),%xmm13 + +# qhasm: v00 = r0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = r4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = r0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = r4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#15 +# asm 2: vpor r0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r4 = v01 | v11 +# asm 1: vpor r4=reg128#7 +# asm 2: vpor r4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = r1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = r5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = r1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq 
$32,%xmm7,%xmm7 + +# qhasm: v11 = r5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: r1 = v00 | v10 +# asm 1: vpor r1=reg128#11 +# asm 2: vpor r1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#8 +# asm 2: vpor r5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = r2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = r6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = r2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = r6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: r2 = v00 | v10 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#9 +# asm 2: vpor r6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = r3 & mask0 +# asm 1: vpand v00=reg128#1 +# asm 2: vpand v00=%xmm0 +vpand %xmm0,%xmm9,%xmm0 + +# qhasm: 2x v10 = r7 << 32 +# asm 1: vpsllq $32,v10=reg128#13 +# asm 2: vpsllq $32,v10=%xmm12 +vpsllq $32,%xmm13,%xmm12 + +# qhasm: 2x v01 = r3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = r7 & mask1 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm1,%xmm13,%xmm1 + +# qhasm: r3 = v00 | v10 +# asm 1: vpor r3=reg128#1 +# asm 2: vpor r3=%xmm0 +vpor %xmm12,%xmm0,%xmm0 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#2 +# asm 2: vpor r7=%xmm1 +vpor %xmm1,%xmm9,%xmm1 + +# qhasm: v00 = r0 & mask2 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm2,%xmm14,%xmm9 + +# qhasm: 4x v10 = r2 << 16 +# asm 1: vpslld $16,v10=reg128#13 +# asm 2: vpslld $16,v10=%xmm12 +vpslld $16,%xmm11,%xmm12 + +# qhasm: 4x v01 = r0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#14 +# asm 2: vpsrld $16,v01=%xmm13 +vpsrld $16,%xmm14,%xmm13 + +# qhasm: v11 = r2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#10 +# asm 2: vpor r0=%xmm9 +vpor %xmm12,%xmm9,%xmm9 + +# qhasm: r2 = v01 | v11 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm11,%xmm13,%xmm11 + +# qhasm: v00 = r1 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm10,%xmm12 + +# qhasm: 4x v10 = r3 << 16 +# asm 1: vpslld $16,v10=reg128#14 +# asm 2: vpslld $16,v10=%xmm13 +vpslld $16,%xmm0,%xmm13 + +# qhasm: 4x v01 = r1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = r3 & mask3 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm3,%xmm0,%xmm0 + +# qhasm: r1 = v00 | v10 +# asm 1: vpor r1=reg128#13 +# asm 2: vpor r1=%xmm12 +vpor %xmm13,%xmm12,%xmm12 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#1 +# asm 2: vpor r3=%xmm0 +vpor %xmm0,%xmm10,%xmm0 + +# qhasm: v00 = r4 & mask2 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm2,%xmm6,%xmm10 + +# qhasm: 4x v10 = r6 << 16 +# asm 1: vpslld $16,v10=reg128#14 +# asm 2: vpslld $16,v10=%xmm13 +vpslld $16,%xmm8,%xmm13 + +# qhasm: 4x v01 = r4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = r6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 
r4 = v00 | v10 +# asm 1: vpor r4=reg128#11 +# asm 2: vpor r4=%xmm10 +vpor %xmm13,%xmm10,%xmm10 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#7 +# asm 2: vpor r6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = r5 & mask2 +# asm 1: vpand v00=reg128#3 +# asm 2: vpand v00=%xmm2 +vpand %xmm2,%xmm7,%xmm2 + +# qhasm: 4x v10 = r7 << 16 +# asm 1: vpslld $16,v10=reg128#9 +# asm 2: vpslld $16,v10=%xmm8 +vpslld $16,%xmm1,%xmm8 + +# qhasm: 4x v01 = r5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = r7 & mask3 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm3,%xmm1,%xmm1 + +# qhasm: r5 = v00 | v10 +# asm 1: vpor r5=reg128#3 +# asm 2: vpor r5=%xmm2 +vpor %xmm8,%xmm2,%xmm2 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#2 +# asm 2: vpor r7=%xmm1 +vpor %xmm1,%xmm7,%xmm1 + +# qhasm: v00 = r0 & mask4 +# asm 1: vpand v00=reg128#4 +# asm 2: vpand v00=%xmm3 +vpand %xmm4,%xmm9,%xmm3 + +# qhasm: 8x v10 = r1 << 8 +# asm 1: vpsllw $8,v10=reg128#8 +# asm 2: vpsllw $8,v10=%xmm7 +vpsllw $8,%xmm12,%xmm7 + +# qhasm: 8x v01 = r0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#9 +# asm 2: vpsrlw $8,v01=%xmm8 +vpsrlw $8,%xmm9,%xmm8 + +# qhasm: v11 = r1 & mask5 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm5,%xmm12,%xmm9 + +# qhasm: r0 = v00 | v10 +# asm 1: vpor r0=reg128#4 +# asm 2: vpor r0=%xmm3 +vpor %xmm7,%xmm3,%xmm3 + +# qhasm: r1 = v01 | v11 +# asm 1: vpor r1=reg128#8 +# asm 2: vpor r1=%xmm7 +vpor %xmm9,%xmm8,%xmm7 + +# qhasm: v00 = r2 & mask4 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm4,%xmm11,%xmm8 + +# qhasm: 8x v10 = r3 << 8 +# asm 1: vpsllw $8,v10=reg128#10 +# asm 2: vpsllw $8,v10=%xmm9 +vpsllw $8,%xmm0,%xmm9 + +# qhasm: 8x v01 = r2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = r3 & mask5 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm5,%xmm0,%xmm0 + +# qhasm: r2 = v00 | v10 +# asm 1: vpor r2=reg128#9 +# asm 2: vpor r2=%xmm8 +vpor %xmm9,%xmm8,%xmm8 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#1 +# asm 2: vpor r3=%xmm0 +vpor %xmm0,%xmm11,%xmm0 + +# qhasm: v00 = r4 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm10,%xmm9 + +# qhasm: 8x v10 = r5 << 8 +# asm 1: vpsllw $8,v10=reg128#12 +# asm 2: vpsllw $8,v10=%xmm11 +vpsllw $8,%xmm2,%xmm11 + +# qhasm: 8x v01 = r4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#11 +# asm 2: vpsrlw $8,v01=%xmm10 +vpsrlw $8,%xmm10,%xmm10 + +# qhasm: v11 = r5 & mask5 +# asm 1: vpand v11=reg128#3 +# asm 2: vpand v11=%xmm2 +vpand %xmm5,%xmm2,%xmm2 + +# qhasm: r4 = v00 | v10 +# asm 1: vpor r4=reg128#10 +# asm 2: vpor r4=%xmm9 +vpor %xmm11,%xmm9,%xmm9 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#3 +# asm 2: vpor r5=%xmm2 +vpor %xmm2,%xmm10,%xmm2 + +# qhasm: v00 = r6 & mask4 +# asm 1: vpand v00=reg128#5 +# asm 2: vpand v00=%xmm4 +vpand %xmm4,%xmm6,%xmm4 + +# qhasm: 8x v10 = r7 << 8 +# asm 1: vpsllw $8,v10=reg128#11 +# asm 2: vpsllw $8,v10=%xmm10 +vpsllw $8,%xmm1,%xmm10 + +# qhasm: 8x v01 = r6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = r7 & mask5 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm5,%xmm1,%xmm1 + +# qhasm: r6 = v00 | v10 +# asm 1: vpor r6=reg128#5 +# asm 2: vpor r6=%xmm4 +vpor %xmm10,%xmm4,%xmm4 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#2 +# asm 2: vpor r7=%xmm1 +vpor %xmm1,%xmm6,%xmm1 + +# qhasm: buf = r0[0] 
+# asm 1: pextrq $0x0,buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm3,%rsi + +# qhasm: mem64[ input_0 + 56 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm7,%rsi + +# qhasm: mem64[ input_0 + 120 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm8,%rsi + +# qhasm: mem64[ input_0 + 184 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm0,%rsi + +# qhasm: mem64[ input_0 + 248 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm9,%rsi + +# qhasm: mem64[ input_0 + 312 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm2,%rsi + +# qhasm: mem64[ input_0 + 376 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm4,%rsi + +# qhasm: mem64[ input_0 + 440 ] = buf +# asm 1: movq buf=int64#2 +# asm 2: pextrq $0x0,buf=%rsi +pextrq $0x0,%xmm1,%rsi + +# qhasm: mem64[ input_0 + 504 ] = buf +# asm 1: movq mask0=reg128#1 +# asm 2: movdqa PQCLEAN_MCELIECE348864F_AVX_MASK2_0(%rip),>mask0=%xmm0 +movdqa PQCLEAN_MCELIECE348864F_AVX_MASK2_0(%rip),%xmm0 + +# qhasm: mask1 aligned= mem128[ PQCLEAN_MCELIECE348864F_AVX_MASK2_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE348864F_AVX_MASK2_1(%rip),>mask1=reg128#2 +# asm 2: movdqa PQCLEAN_MCELIECE348864F_AVX_MASK2_1(%rip),>mask1=%xmm1 +movdqa PQCLEAN_MCELIECE348864F_AVX_MASK2_1(%rip),%xmm1 + +# qhasm: mask2 aligned= mem128[ PQCLEAN_MCELIECE348864F_AVX_MASK1_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE348864F_AVX_MASK1_0(%rip),>mask2=reg128#3 +# asm 2: movdqa PQCLEAN_MCELIECE348864F_AVX_MASK1_0(%rip),>mask2=%xmm2 +movdqa PQCLEAN_MCELIECE348864F_AVX_MASK1_0(%rip),%xmm2 + +# qhasm: mask3 aligned= mem128[ PQCLEAN_MCELIECE348864F_AVX_MASK1_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE348864F_AVX_MASK1_1(%rip),>mask3=reg128#4 +# asm 2: movdqa PQCLEAN_MCELIECE348864F_AVX_MASK1_1(%rip),>mask3=%xmm3 +movdqa PQCLEAN_MCELIECE348864F_AVX_MASK1_1(%rip),%xmm3 + +# qhasm: mask4 aligned= mem128[ PQCLEAN_MCELIECE348864F_AVX_MASK0_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE348864F_AVX_MASK0_0(%rip),>mask4=reg128#5 +# asm 2: movdqa PQCLEAN_MCELIECE348864F_AVX_MASK0_0(%rip),>mask4=%xmm4 +movdqa PQCLEAN_MCELIECE348864F_AVX_MASK0_0(%rip),%xmm4 + +# qhasm: mask5 aligned= mem128[ PQCLEAN_MCELIECE348864F_AVX_MASK0_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE348864F_AVX_MASK0_1(%rip),>mask5=reg128#6 +# asm 2: movdqa PQCLEAN_MCELIECE348864F_AVX_MASK0_1(%rip),>mask5=%xmm5 +movdqa PQCLEAN_MCELIECE348864F_AVX_MASK0_1(%rip),%xmm5 + +# qhasm: r0 = mem64[ input_0 + 0 ] x2 +# asm 1: movddup 0(r0=reg128#7 +# asm 2: movddup 0(r0=%xmm6 +movddup 0(%rdi),%xmm6 + +# qhasm: r1 = mem64[ input_0 + 8 ] x2 +# asm 1: movddup 8(r1=reg128#8 +# asm 2: movddup 8(r1=%xmm7 +movddup 8(%rdi),%xmm7 + +# qhasm: r2 = mem64[ input_0 + 16 ] x2 +# asm 1: movddup 16(r2=reg128#9 +# asm 2: movddup 16(r2=%xmm8 +movddup 16(%rdi),%xmm8 + +# qhasm: r3 = mem64[ input_0 + 24 ] x2 +# asm 1: movddup 24(r3=reg128#10 +# asm 2: movddup 24(r3=%xmm9 +movddup 24(%rdi),%xmm9 + +# qhasm: r4 = mem64[ input_0 + 32 ] x2 +# asm 1: movddup 32(r4=reg128#11 +# asm 2: movddup 32(r4=%xmm10 +movddup 32(%rdi),%xmm10 + +# qhasm: r5 = mem64[ input_0 + 40 ] x2 +# asm 1: movddup 40(r5=reg128#12 +# asm 2: movddup 40(r5=%xmm11 +movddup 40(%rdi),%xmm11 + +# qhasm: r6 = mem64[ input_0 + 48 ] x2 +# asm 1: movddup 48(r6=reg128#13 +# asm 2: movddup 48(r6=%xmm12 +movddup 48(%rdi),%xmm12 + +# qhasm: r7 = mem64[ input_0 + 56 ] x2 +# asm 1: movddup 56(r7=reg128#14 +# asm 2: movddup 56(r7=%xmm13 +movddup 
56(%rdi),%xmm13 + +# qhasm: v00 = r0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = r4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = r4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r0=reg128#15 +# asm 2: vpor r0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r4 = v01 | v11 +# asm 1: vpor r4=reg128#7 +# asm 2: vpor r4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = r1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = r5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = r5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r1=reg128#11 +# asm 2: vpor r1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#8 +# asm 2: vpor r5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = r2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = r6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = r6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#9 +# asm 2: vpor r6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = r3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = r7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = r7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r3=reg128#13 +# asm 2: vpor r3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#10 +# asm 2: vpor r7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = r0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = r2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = r2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r0=reg128#14 +# asm 2: vpor r0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: r2 = v01 | v11 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = r1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = r3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand 
v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = r3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r1=reg128#15 +# asm 2: vpor r1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = r4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = r6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = r6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r4=reg128#13 +# asm 2: vpor r4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#7 +# asm 2: vpor r6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = r5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = r7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = r7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#8 +# asm 2: vpor r7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = r0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = r1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = r1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r0=reg128#10 +# asm 2: vpor r0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: r1 = v01 | v11 +# asm 1: vpor r1=reg128#14 +# asm 2: vpor r1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = r2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = r3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = r3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r2=reg128#15 +# asm 2: vpor r2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = r4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = r5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = r5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: 
vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r4=reg128#12 +# asm 2: vpor r4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = r6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = r7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = r7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r6=reg128#13 +# asm 2: vpor r6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#7 +# asm 2: vpor r7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: t0 = r0[0]r1[0] +# asm 1: vpunpcklqdq t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm13,%xmm9,%xmm7 + +# qhasm: mem128[ input_0 + 0 ] = t0 +# asm 1: movdqu t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm10,%xmm14,%xmm7 + +# qhasm: mem128[ input_0 + 16 ] = t0 +# asm 1: movdqu t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm8,%xmm11,%xmm7 + +# qhasm: mem128[ input_0 + 32 ] = t0 +# asm 1: movdqu t0=reg128#7 +# asm 2: vpunpcklqdq t0=%xmm6 +vpunpcklqdq %xmm6,%xmm12,%xmm6 + +# qhasm: mem128[ input_0 + 48 ] = t0 +# asm 1: movdqu r0=reg128#7 +# asm 2: movddup 64(r0=%xmm6 +movddup 64(%rdi),%xmm6 + +# qhasm: r1 = mem64[ input_0 + 72 ] x2 +# asm 1: movddup 72(r1=reg128#8 +# asm 2: movddup 72(r1=%xmm7 +movddup 72(%rdi),%xmm7 + +# qhasm: r2 = mem64[ input_0 + 80 ] x2 +# asm 1: movddup 80(r2=reg128#9 +# asm 2: movddup 80(r2=%xmm8 +movddup 80(%rdi),%xmm8 + +# qhasm: r3 = mem64[ input_0 + 88 ] x2 +# asm 1: movddup 88(r3=reg128#10 +# asm 2: movddup 88(r3=%xmm9 +movddup 88(%rdi),%xmm9 + +# qhasm: r4 = mem64[ input_0 + 96 ] x2 +# asm 1: movddup 96(r4=reg128#11 +# asm 2: movddup 96(r4=%xmm10 +movddup 96(%rdi),%xmm10 + +# qhasm: r5 = mem64[ input_0 + 104 ] x2 +# asm 1: movddup 104(r5=reg128#12 +# asm 2: movddup 104(r5=%xmm11 +movddup 104(%rdi),%xmm11 + +# qhasm: r6 = mem64[ input_0 + 112 ] x2 +# asm 1: movddup 112(r6=reg128#13 +# asm 2: movddup 112(r6=%xmm12 +movddup 112(%rdi),%xmm12 + +# qhasm: r7 = mem64[ input_0 + 120 ] x2 +# asm 1: movddup 120(r7=reg128#14 +# asm 2: movddup 120(r7=%xmm13 +movddup 120(%rdi),%xmm13 + +# qhasm: v00 = r0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = r4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = r4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r0=reg128#15 +# asm 2: vpor r0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r4 = v01 | v11 +# asm 1: vpor r4=reg128#7 +# asm 2: vpor r4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = r1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = r5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = r5 & mask1 +# asm 1: vpand v11=reg128#12 +# 
asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r1=reg128#11 +# asm 2: vpor r1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#8 +# asm 2: vpor r5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = r2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = r6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = r6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#9 +# asm 2: vpor r6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = r3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = r7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = r7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r3=reg128#13 +# asm 2: vpor r3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#10 +# asm 2: vpor r7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = r0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = r2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = r2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r0=reg128#14 +# asm 2: vpor r0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: r2 = v01 | v11 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = r1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = r3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = r3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r1=reg128#15 +# asm 2: vpor r1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = r4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = r6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = r6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r4=reg128#13 +# asm 2: vpor r4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor 
r6=reg128#7 +# asm 2: vpor r6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = r5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = r7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = r7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#8 +# asm 2: vpor r7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = r0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = r1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = r1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r0=reg128#10 +# asm 2: vpor r0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: r1 = v01 | v11 +# asm 1: vpor r1=reg128#14 +# asm 2: vpor r1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = r2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = r3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = r3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r2=reg128#15 +# asm 2: vpor r2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = r4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = r5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = r5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r4=reg128#12 +# asm 2: vpor r4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = r6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = r7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = r7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r6=reg128#13 +# asm 2: vpor r6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#7 +# asm 2: vpor r7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: t0 = r0[0]r1[0] +# asm 1: vpunpcklqdq t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm13,%xmm9,%xmm7 + +# qhasm: mem128[ input_0 + 64 ] = t0 +# 
asm 1: movdqu t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm10,%xmm14,%xmm7 + +# qhasm: mem128[ input_0 + 80 ] = t0 +# asm 1: movdqu t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm8,%xmm11,%xmm7 + +# qhasm: mem128[ input_0 + 96 ] = t0 +# asm 1: movdqu t0=reg128#7 +# asm 2: vpunpcklqdq t0=%xmm6 +vpunpcklqdq %xmm6,%xmm12,%xmm6 + +# qhasm: mem128[ input_0 + 112 ] = t0 +# asm 1: movdqu r0=reg128#7 +# asm 2: movddup 128(r0=%xmm6 +movddup 128(%rdi),%xmm6 + +# qhasm: r1 = mem64[ input_0 + 136 ] x2 +# asm 1: movddup 136(r1=reg128#8 +# asm 2: movddup 136(r1=%xmm7 +movddup 136(%rdi),%xmm7 + +# qhasm: r2 = mem64[ input_0 + 144 ] x2 +# asm 1: movddup 144(r2=reg128#9 +# asm 2: movddup 144(r2=%xmm8 +movddup 144(%rdi),%xmm8 + +# qhasm: r3 = mem64[ input_0 + 152 ] x2 +# asm 1: movddup 152(r3=reg128#10 +# asm 2: movddup 152(r3=%xmm9 +movddup 152(%rdi),%xmm9 + +# qhasm: r4 = mem64[ input_0 + 160 ] x2 +# asm 1: movddup 160(r4=reg128#11 +# asm 2: movddup 160(r4=%xmm10 +movddup 160(%rdi),%xmm10 + +# qhasm: r5 = mem64[ input_0 + 168 ] x2 +# asm 1: movddup 168(r5=reg128#12 +# asm 2: movddup 168(r5=%xmm11 +movddup 168(%rdi),%xmm11 + +# qhasm: r6 = mem64[ input_0 + 176 ] x2 +# asm 1: movddup 176(r6=reg128#13 +# asm 2: movddup 176(r6=%xmm12 +movddup 176(%rdi),%xmm12 + +# qhasm: r7 = mem64[ input_0 + 184 ] x2 +# asm 1: movddup 184(r7=reg128#14 +# asm 2: movddup 184(r7=%xmm13 +movddup 184(%rdi),%xmm13 + +# qhasm: v00 = r0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = r4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = r4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r0=reg128#15 +# asm 2: vpor r0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r4 = v01 | v11 +# asm 1: vpor r4=reg128#7 +# asm 2: vpor r4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = r1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = r5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = r5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r1=reg128#11 +# asm 2: vpor r1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#8 +# asm 2: vpor r5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = r2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = r6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = r6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#9 +# asm 2: vpor r6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = r3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = r7 
& mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = r7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r3=reg128#13 +# asm 2: vpor r3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#10 +# asm 2: vpor r7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = r0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = r2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = r2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r0=reg128#14 +# asm 2: vpor r0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: r2 = v01 | v11 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = r1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = r3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = r3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r1=reg128#15 +# asm 2: vpor r1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = r4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = r6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = r6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r4=reg128#13 +# asm 2: vpor r4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#7 +# asm 2: vpor r6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = r5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = r7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = r7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#8 +# asm 2: vpor r7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = r0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = r1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = 
r1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r0=reg128#10 +# asm 2: vpor r0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: r1 = v01 | v11 +# asm 1: vpor r1=reg128#14 +# asm 2: vpor r1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = r2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = r3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = r3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r2=reg128#15 +# asm 2: vpor r2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = r4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = r5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = r5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r4=reg128#12 +# asm 2: vpor r4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = r6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = r7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = r7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r6=reg128#13 +# asm 2: vpor r6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#7 +# asm 2: vpor r7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: t0 = r0[0]r1[0] +# asm 1: vpunpcklqdq t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm13,%xmm9,%xmm7 + +# qhasm: mem128[ input_0 + 128 ] = t0 +# asm 1: movdqu t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm10,%xmm14,%xmm7 + +# qhasm: mem128[ input_0 + 144 ] = t0 +# asm 1: movdqu t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm8,%xmm11,%xmm7 + +# qhasm: mem128[ input_0 + 160 ] = t0 +# asm 1: movdqu t0=reg128#7 +# asm 2: vpunpcklqdq t0=%xmm6 +vpunpcklqdq %xmm6,%xmm12,%xmm6 + +# qhasm: mem128[ input_0 + 176 ] = t0 +# asm 1: movdqu r0=reg128#7 +# asm 2: movddup 192(r0=%xmm6 +movddup 192(%rdi),%xmm6 + +# qhasm: r1 = mem64[ input_0 + 200 ] x2 +# asm 1: movddup 200(r1=reg128#8 +# asm 2: movddup 200(r1=%xmm7 +movddup 200(%rdi),%xmm7 + +# qhasm: r2 = mem64[ input_0 + 208 ] x2 +# asm 1: movddup 208(r2=reg128#9 +# asm 2: movddup 208(r2=%xmm8 +movddup 208(%rdi),%xmm8 + +# qhasm: r3 = mem64[ input_0 + 216 ] x2 +# asm 1: movddup 216(r3=reg128#10 +# asm 2: movddup 216(r3=%xmm9 +movddup 216(%rdi),%xmm9 + +# qhasm: r4 = mem64[ input_0 + 224 ] x2 +# asm 1: movddup 224(r4=reg128#11 +# asm 2: movddup 224(r4=%xmm10 +movddup 224(%rdi),%xmm10 + +# qhasm: r5 = mem64[ input_0 + 232 ] x2 +# asm 
1: movddup 232(r5=reg128#12 +# asm 2: movddup 232(r5=%xmm11 +movddup 232(%rdi),%xmm11 + +# qhasm: r6 = mem64[ input_0 + 240 ] x2 +# asm 1: movddup 240(r6=reg128#13 +# asm 2: movddup 240(r6=%xmm12 +movddup 240(%rdi),%xmm12 + +# qhasm: r7 = mem64[ input_0 + 248 ] x2 +# asm 1: movddup 248(r7=reg128#14 +# asm 2: movddup 248(r7=%xmm13 +movddup 248(%rdi),%xmm13 + +# qhasm: v00 = r0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = r4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = r4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r0=reg128#15 +# asm 2: vpor r0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r4 = v01 | v11 +# asm 1: vpor r4=reg128#7 +# asm 2: vpor r4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = r1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = r5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = r5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r1=reg128#11 +# asm 2: vpor r1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#8 +# asm 2: vpor r5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = r2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = r6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = r6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#9 +# asm 2: vpor r6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = r3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = r7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = r7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r3=reg128#13 +# asm 2: vpor r3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#10 +# asm 2: vpor r7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = r0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = r2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = r2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r0=reg128#14 +# asm 2: vpor 
r0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: r2 = v01 | v11 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = r1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = r3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = r3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r1=reg128#15 +# asm 2: vpor r1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = r4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = r6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = r6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r4=reg128#13 +# asm 2: vpor r4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#7 +# asm 2: vpor r6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = r5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = r7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = r7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#8 +# asm 2: vpor r7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = r0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = r1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = r1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r0=reg128#10 +# asm 2: vpor r0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: r1 = v01 | v11 +# asm 1: vpor r1=reg128#14 +# asm 2: vpor r1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = r2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = r3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = r3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r2=reg128#15 +# asm 2: vpor r2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = r4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand 
v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = r5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = r5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r4=reg128#12 +# asm 2: vpor r4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = r6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = r7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = r7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r6=reg128#13 +# asm 2: vpor r6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#7 +# asm 2: vpor r7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: t0 = r0[0]r1[0] +# asm 1: vpunpcklqdq t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm13,%xmm9,%xmm7 + +# qhasm: mem128[ input_0 + 192 ] = t0 +# asm 1: movdqu t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm10,%xmm14,%xmm7 + +# qhasm: mem128[ input_0 + 208 ] = t0 +# asm 1: movdqu t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm8,%xmm11,%xmm7 + +# qhasm: mem128[ input_0 + 224 ] = t0 +# asm 1: movdqu t0=reg128#7 +# asm 2: vpunpcklqdq t0=%xmm6 +vpunpcklqdq %xmm6,%xmm12,%xmm6 + +# qhasm: mem128[ input_0 + 240 ] = t0 +# asm 1: movdqu r0=reg128#7 +# asm 2: movddup 256(r0=%xmm6 +movddup 256(%rdi),%xmm6 + +# qhasm: r1 = mem64[ input_0 + 264 ] x2 +# asm 1: movddup 264(r1=reg128#8 +# asm 2: movddup 264(r1=%xmm7 +movddup 264(%rdi),%xmm7 + +# qhasm: r2 = mem64[ input_0 + 272 ] x2 +# asm 1: movddup 272(r2=reg128#9 +# asm 2: movddup 272(r2=%xmm8 +movddup 272(%rdi),%xmm8 + +# qhasm: r3 = mem64[ input_0 + 280 ] x2 +# asm 1: movddup 280(r3=reg128#10 +# asm 2: movddup 280(r3=%xmm9 +movddup 280(%rdi),%xmm9 + +# qhasm: r4 = mem64[ input_0 + 288 ] x2 +# asm 1: movddup 288(r4=reg128#11 +# asm 2: movddup 288(r4=%xmm10 +movddup 288(%rdi),%xmm10 + +# qhasm: r5 = mem64[ input_0 + 296 ] x2 +# asm 1: movddup 296(r5=reg128#12 +# asm 2: movddup 296(r5=%xmm11 +movddup 296(%rdi),%xmm11 + +# qhasm: r6 = mem64[ input_0 + 304 ] x2 +# asm 1: movddup 304(r6=reg128#13 +# asm 2: movddup 304(r6=%xmm12 +movddup 304(%rdi),%xmm12 + +# qhasm: r7 = mem64[ input_0 + 312 ] x2 +# asm 1: movddup 312(r7=reg128#14 +# asm 2: movddup 312(r7=%xmm13 +movddup 312(%rdi),%xmm13 + +# qhasm: v00 = r0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = r4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = r4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r0=reg128#15 +# asm 2: vpor r0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r4 = v01 | v11 +# asm 1: vpor r4=reg128#7 +# asm 2: vpor r4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = r1 & mask0 +# asm 1: 
vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = r5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = r5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r1=reg128#11 +# asm 2: vpor r1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#8 +# asm 2: vpor r5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = r2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = r6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = r6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#9 +# asm 2: vpor r6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = r3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = r7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = r7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r3=reg128#13 +# asm 2: vpor r3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#10 +# asm 2: vpor r7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = r0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = r2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = r2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r0=reg128#14 +# asm 2: vpor r0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: r2 = v01 | v11 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = r1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = r3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = r3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r1=reg128#15 +# asm 2: vpor r1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = r4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = r6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 
<<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = r6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r4=reg128#13 +# asm 2: vpor r4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#7 +# asm 2: vpor r6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = r5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = r7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = r7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#8 +# asm 2: vpor r7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = r0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = r1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = r1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r0=reg128#10 +# asm 2: vpor r0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: r1 = v01 | v11 +# asm 1: vpor r1=reg128#14 +# asm 2: vpor r1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = r2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = r3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = r3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r2=reg128#15 +# asm 2: vpor r2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = r4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = r5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = r5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r4=reg128#12 +# asm 2: vpor r4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = r6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = r7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = r7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 
1 +# asm 1: psrlq $1,r6=reg128#13 +# asm 2: vpor r6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#7 +# asm 2: vpor r7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: t0 = r0[0]r1[0] +# asm 1: vpunpcklqdq t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm13,%xmm9,%xmm7 + +# qhasm: mem128[ input_0 + 256 ] = t0 +# asm 1: movdqu t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm10,%xmm14,%xmm7 + +# qhasm: mem128[ input_0 + 272 ] = t0 +# asm 1: movdqu t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm8,%xmm11,%xmm7 + +# qhasm: mem128[ input_0 + 288 ] = t0 +# asm 1: movdqu t0=reg128#7 +# asm 2: vpunpcklqdq t0=%xmm6 +vpunpcklqdq %xmm6,%xmm12,%xmm6 + +# qhasm: mem128[ input_0 + 304 ] = t0 +# asm 1: movdqu r0=reg128#7 +# asm 2: movddup 320(r0=%xmm6 +movddup 320(%rdi),%xmm6 + +# qhasm: r1 = mem64[ input_0 + 328 ] x2 +# asm 1: movddup 328(r1=reg128#8 +# asm 2: movddup 328(r1=%xmm7 +movddup 328(%rdi),%xmm7 + +# qhasm: r2 = mem64[ input_0 + 336 ] x2 +# asm 1: movddup 336(r2=reg128#9 +# asm 2: movddup 336(r2=%xmm8 +movddup 336(%rdi),%xmm8 + +# qhasm: r3 = mem64[ input_0 + 344 ] x2 +# asm 1: movddup 344(r3=reg128#10 +# asm 2: movddup 344(r3=%xmm9 +movddup 344(%rdi),%xmm9 + +# qhasm: r4 = mem64[ input_0 + 352 ] x2 +# asm 1: movddup 352(r4=reg128#11 +# asm 2: movddup 352(r4=%xmm10 +movddup 352(%rdi),%xmm10 + +# qhasm: r5 = mem64[ input_0 + 360 ] x2 +# asm 1: movddup 360(r5=reg128#12 +# asm 2: movddup 360(r5=%xmm11 +movddup 360(%rdi),%xmm11 + +# qhasm: r6 = mem64[ input_0 + 368 ] x2 +# asm 1: movddup 368(r6=reg128#13 +# asm 2: movddup 368(r6=%xmm12 +movddup 368(%rdi),%xmm12 + +# qhasm: r7 = mem64[ input_0 + 376 ] x2 +# asm 1: movddup 376(r7=reg128#14 +# asm 2: movddup 376(r7=%xmm13 +movddup 376(%rdi),%xmm13 + +# qhasm: v00 = r0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = r4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = r4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r0=reg128#15 +# asm 2: vpor r0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r4 = v01 | v11 +# asm 1: vpor r4=reg128#7 +# asm 2: vpor r4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = r1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = r5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = r5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r1=reg128#11 +# asm 2: vpor r1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#8 +# asm 2: vpor r5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = r2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = r6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = r6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand 
%xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#9 +# asm 2: vpor r6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = r3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = r7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = r7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r3=reg128#13 +# asm 2: vpor r3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#10 +# asm 2: vpor r7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = r0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = r2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = r2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r0=reg128#14 +# asm 2: vpor r0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: r2 = v01 | v11 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = r1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = r3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = r3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r1=reg128#15 +# asm 2: vpor r1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = r4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = r6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = r6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r4=reg128#13 +# asm 2: vpor r4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#7 +# asm 2: vpor r6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = r5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = r7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = r7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#8 +# asm 2: vpor r7=%xmm7 +vpor 
%xmm9,%xmm7,%xmm7 + +# qhasm: v00 = r0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = r1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = r1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r0=reg128#10 +# asm 2: vpor r0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: r1 = v01 | v11 +# asm 1: vpor r1=reg128#14 +# asm 2: vpor r1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = r2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = r3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = r3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r2=reg128#15 +# asm 2: vpor r2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = r4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = r5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = r5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r4=reg128#12 +# asm 2: vpor r4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = r6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = r7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = r7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r6=reg128#13 +# asm 2: vpor r6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#7 +# asm 2: vpor r7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: t0 = r0[0]r1[0] +# asm 1: vpunpcklqdq t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm13,%xmm9,%xmm7 + +# qhasm: mem128[ input_0 + 320 ] = t0 +# asm 1: movdqu t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm10,%xmm14,%xmm7 + +# qhasm: mem128[ input_0 + 336 ] = t0 +# asm 1: movdqu t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm8,%xmm11,%xmm7 + +# qhasm: mem128[ input_0 + 352 ] = t0 +# asm 1: movdqu t0=reg128#7 +# asm 2: vpunpcklqdq t0=%xmm6 +vpunpcklqdq %xmm6,%xmm12,%xmm6 + +# qhasm: mem128[ input_0 + 368 ] = t0 +# asm 1: movdqu r0=reg128#7 +# asm 2: movddup 384(r0=%xmm6 +movddup 384(%rdi),%xmm6 + +# qhasm: r1 = mem64[ input_0 + 392 ] x2 +# asm 1: movddup 392(r1=reg128#8 +# asm 2: movddup 392(r1=%xmm7 +movddup 392(%rdi),%xmm7 + +# qhasm: r2 = mem64[ input_0 + 400 ] x2 +# asm 1: movddup 400(r2=reg128#9 +# 
asm 2: movddup 400(r2=%xmm8 +movddup 400(%rdi),%xmm8 + +# qhasm: r3 = mem64[ input_0 + 408 ] x2 +# asm 1: movddup 408(r3=reg128#10 +# asm 2: movddup 408(r3=%xmm9 +movddup 408(%rdi),%xmm9 + +# qhasm: r4 = mem64[ input_0 + 416 ] x2 +# asm 1: movddup 416(r4=reg128#11 +# asm 2: movddup 416(r4=%xmm10 +movddup 416(%rdi),%xmm10 + +# qhasm: r5 = mem64[ input_0 + 424 ] x2 +# asm 1: movddup 424(r5=reg128#12 +# asm 2: movddup 424(r5=%xmm11 +movddup 424(%rdi),%xmm11 + +# qhasm: r6 = mem64[ input_0 + 432 ] x2 +# asm 1: movddup 432(r6=reg128#13 +# asm 2: movddup 432(r6=%xmm12 +movddup 432(%rdi),%xmm12 + +# qhasm: r7 = mem64[ input_0 + 440 ] x2 +# asm 1: movddup 440(r7=reg128#14 +# asm 2: movddup 440(r7=%xmm13 +movddup 440(%rdi),%xmm13 + +# qhasm: v00 = r0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = r4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = r4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r0=reg128#15 +# asm 2: vpor r0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r4 = v01 | v11 +# asm 1: vpor r4=reg128#7 +# asm 2: vpor r4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = r1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = r5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = r5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r1=reg128#11 +# asm 2: vpor r1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#8 +# asm 2: vpor r5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = r2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = r6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = r6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#9 +# asm 2: vpor r6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = r3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = r7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = r7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r3=reg128#13 +# asm 2: vpor r3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#10 +# asm 2: vpor r7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = r0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = r2 & mask2 +# asm 1: vpand 
v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = r2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r0=reg128#14 +# asm 2: vpor r0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: r2 = v01 | v11 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = r1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = r3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = r3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r1=reg128#15 +# asm 2: vpor r1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = r4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = r6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = r6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r4=reg128#13 +# asm 2: vpor r4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#7 +# asm 2: vpor r6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = r5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = r7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = r7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#8 +# asm 2: vpor r7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = r0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = r1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = r1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r0=reg128#10 +# asm 2: vpor r0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: r1 = v01 | v11 +# asm 1: vpor r1=reg128#14 +# asm 2: vpor r1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = r2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = r3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = r3 & mask5 +# asm 1: 
vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r2=reg128#15 +# asm 2: vpor r2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#11 +# asm 2: vpor r3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = r4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = r5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = r5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r4=reg128#12 +# asm 2: vpor r4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#9 +# asm 2: vpor r5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = r6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = r7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = r7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r6=reg128#13 +# asm 2: vpor r6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#7 +# asm 2: vpor r7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: t0 = r0[0]r1[0] +# asm 1: vpunpcklqdq t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm13,%xmm9,%xmm7 + +# qhasm: mem128[ input_0 + 384 ] = t0 +# asm 1: movdqu t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm10,%xmm14,%xmm7 + +# qhasm: mem128[ input_0 + 400 ] = t0 +# asm 1: movdqu t0=reg128#8 +# asm 2: vpunpcklqdq t0=%xmm7 +vpunpcklqdq %xmm8,%xmm11,%xmm7 + +# qhasm: mem128[ input_0 + 416 ] = t0 +# asm 1: movdqu t0=reg128#7 +# asm 2: vpunpcklqdq t0=%xmm6 +vpunpcklqdq %xmm6,%xmm12,%xmm6 + +# qhasm: mem128[ input_0 + 432 ] = t0 +# asm 1: movdqu r0=reg128#7 +# asm 2: movddup 448(r0=%xmm6 +movddup 448(%rdi),%xmm6 + +# qhasm: r1 = mem64[ input_0 + 456 ] x2 +# asm 1: movddup 456(r1=reg128#8 +# asm 2: movddup 456(r1=%xmm7 +movddup 456(%rdi),%xmm7 + +# qhasm: r2 = mem64[ input_0 + 464 ] x2 +# asm 1: movddup 464(r2=reg128#9 +# asm 2: movddup 464(r2=%xmm8 +movddup 464(%rdi),%xmm8 + +# qhasm: r3 = mem64[ input_0 + 472 ] x2 +# asm 1: movddup 472(r3=reg128#10 +# asm 2: movddup 472(r3=%xmm9 +movddup 472(%rdi),%xmm9 + +# qhasm: r4 = mem64[ input_0 + 480 ] x2 +# asm 1: movddup 480(r4=reg128#11 +# asm 2: movddup 480(r4=%xmm10 +movddup 480(%rdi),%xmm10 + +# qhasm: r5 = mem64[ input_0 + 488 ] x2 +# asm 1: movddup 488(r5=reg128#12 +# asm 2: movddup 488(r5=%xmm11 +movddup 488(%rdi),%xmm11 + +# qhasm: r6 = mem64[ input_0 + 496 ] x2 +# asm 1: movddup 496(r6=reg128#13 +# asm 2: movddup 496(r6=%xmm12 +movddup 496(%rdi),%xmm12 + +# qhasm: r7 = mem64[ input_0 + 504 ] x2 +# asm 1: movddup 504(r7=reg128#14 +# asm 2: movddup 504(r7=%xmm13 +movddup 504(%rdi),%xmm13 + +# qhasm: v00 = r0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = r4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + 
+# qhasm: v11 = r4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r0=reg128#15 +# asm 2: vpor r0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: r4 = v01 | v11 +# asm 1: vpor r4=reg128#7 +# asm 2: vpor r4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = r1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = r5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = r5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r1=reg128#11 +# asm 2: vpor r1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#8 +# asm 2: vpor r5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = r2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = r6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = r6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#9 +# asm 2: vpor r6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = r3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = r7 & mask0 +# asm 1: vpand v10=reg128#1 +# asm 2: vpand v10=%xmm0 +vpand %xmm0,%xmm13,%xmm0 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = r7 & mask1 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm1,%xmm13,%xmm1 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,r3=reg128#1 +# asm 2: vpor r3=%xmm0 +vpor %xmm0,%xmm12,%xmm0 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#2 +# asm 2: vpor r7=%xmm1 +vpor %xmm1,%xmm9,%xmm1 + +# qhasm: v00 = r0 & mask2 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm2,%xmm14,%xmm9 + +# qhasm: v10 = r2 & mask2 +# asm 1: vpand v10=reg128#13 +# asm 2: vpand v10=%xmm12 +vpand %xmm2,%xmm11,%xmm12 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm3,%xmm14,%xmm13 + +# qhasm: v11 = r2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r0=reg128#10 +# asm 2: vpor r0=%xmm9 +vpor %xmm12,%xmm9,%xmm9 + +# qhasm: r2 = v01 | v11 +# asm 1: vpor r2=reg128#12 +# asm 2: vpor r2=%xmm11 +vpor %xmm11,%xmm13,%xmm11 + +# qhasm: v00 = r1 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm10,%xmm12 + +# qhasm: v10 = r3 & mask2 +# asm 1: vpand v10=reg128#14 +# asm 2: vpand v10=%xmm13 +vpand %xmm2,%xmm0,%xmm13 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = r3 & mask3 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm3,%xmm0,%xmm0 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r1=reg128#13 +# asm 2: vpor r1=%xmm12 +vpor %xmm13,%xmm12,%xmm12 + 
+# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#1 +# asm 2: vpor r3=%xmm0 +vpor %xmm0,%xmm10,%xmm0 + +# qhasm: v00 = r4 & mask2 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm2,%xmm6,%xmm10 + +# qhasm: v10 = r6 & mask2 +# asm 1: vpand v10=reg128#14 +# asm 2: vpand v10=%xmm13 +vpand %xmm2,%xmm8,%xmm13 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = r6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r4=reg128#11 +# asm 2: vpor r4=%xmm10 +vpor %xmm13,%xmm10,%xmm10 + +# qhasm: r6 = v01 | v11 +# asm 1: vpor r6=reg128#7 +# asm 2: vpor r6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = r5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = r7 & mask2 +# asm 1: vpand v10=reg128#3 +# asm 2: vpand v10=%xmm2 +vpand %xmm2,%xmm1,%xmm2 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = r7 & mask3 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm3,%xmm1,%xmm1 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,r5=reg128#3 +# asm 2: vpor r5=%xmm2 +vpor %xmm2,%xmm8,%xmm2 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#2 +# asm 2: vpor r7=%xmm1 +vpor %xmm1,%xmm7,%xmm1 + +# qhasm: v00 = r0 & mask4 +# asm 1: vpand v00=reg128#4 +# asm 2: vpand v00=%xmm3 +vpand %xmm4,%xmm9,%xmm3 + +# qhasm: v10 = r1 & mask4 +# asm 1: vpand v10=reg128#8 +# asm 2: vpand v10=%xmm7 +vpand %xmm4,%xmm12,%xmm7 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm5,%xmm9,%xmm8 + +# qhasm: v11 = r1 & mask5 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm5,%xmm12,%xmm9 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r0=reg128#4 +# asm 2: vpor r0=%xmm3 +vpor %xmm7,%xmm3,%xmm3 + +# qhasm: r1 = v01 | v11 +# asm 1: vpor r1=reg128#8 +# asm 2: vpor r1=%xmm7 +vpor %xmm9,%xmm8,%xmm7 + +# qhasm: v00 = r2 & mask4 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm4,%xmm11,%xmm8 + +# qhasm: v10 = r3 & mask4 +# asm 1: vpand v10=reg128#10 +# asm 2: vpand v10=%xmm9 +vpand %xmm4,%xmm0,%xmm9 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = r3 & mask5 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm5,%xmm0,%xmm0 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r2=reg128#9 +# asm 2: vpor r2=%xmm8 +vpor %xmm9,%xmm8,%xmm8 + +# qhasm: r3 = v01 | v11 +# asm 1: vpor r3=reg128#1 +# asm 2: vpor r3=%xmm0 +vpor %xmm0,%xmm11,%xmm0 + +# qhasm: v00 = r4 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm10,%xmm9 + +# qhasm: v10 = r5 & mask4 +# asm 1: vpand v10=reg128#12 +# asm 2: vpand v10=%xmm11 +vpand %xmm4,%xmm2,%xmm11 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: v11 = r5 & mask5 +# asm 1: vpand v11=reg128#3 +# asm 2: vpand v11=%xmm2 +vpand %xmm5,%xmm2,%xmm2 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r4=reg128#10 +# asm 2: vpor r4=%xmm9 +vpor %xmm11,%xmm9,%xmm9 + +# qhasm: r5 = v01 | v11 +# asm 1: vpor r5=reg128#3 +# asm 2: vpor r5=%xmm2 +vpor %xmm2,%xmm10,%xmm2 + +# qhasm: v00 = r6 & mask4 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm4,%xmm6,%xmm10 + +# qhasm: v10 = r7 & mask4 +# asm 1: vpand v10=reg128#5 +# 
asm 2: vpand v10=%xmm4 +vpand %xmm4,%xmm1,%xmm4 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = r7 & mask5 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm5,%xmm1,%xmm1 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,r6=reg128#5 +# asm 2: vpor r6=%xmm4 +vpor %xmm4,%xmm10,%xmm4 + +# qhasm: r7 = v01 | v11 +# asm 1: vpor r7=reg128#2 +# asm 2: vpor r7=%xmm1 +vpor %xmm1,%xmm6,%xmm1 + +# qhasm: t0 = r0[0]r1[0] +# asm 1: vpunpcklqdq t0=reg128#4 +# asm 2: vpunpcklqdq t0=%xmm3 +vpunpcklqdq %xmm7,%xmm3,%xmm3 + +# qhasm: mem128[ input_0 + 448 ] = t0 +# asm 1: movdqu t0=reg128#1 +# asm 2: vpunpcklqdq t0=%xmm0 +vpunpcklqdq %xmm0,%xmm8,%xmm0 + +# qhasm: mem128[ input_0 + 464 ] = t0 +# asm 1: movdqu t0=reg128#1 +# asm 2: vpunpcklqdq t0=%xmm0 +vpunpcklqdq %xmm2,%xmm9,%xmm0 + +# qhasm: mem128[ input_0 + 480 ] = t0 +# asm 1: movdqu t0=reg128#1 +# asm 2: vpunpcklqdq t0=%xmm0 +vpunpcklqdq %xmm1,%xmm4,%xmm0 + +# qhasm: mem128[ input_0 + 496 ] = t0 +# asm 1: movdqu +#include + +void PQCLEAN_MCELIECE348864F_AVX_uint32_sort(uint32_t *x, size_t n); + +#endif diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/update_asm.S b/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/update_asm.S new file mode 100644 index 0000000000..6ab3338c83 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/update_asm.S @@ -0,0 +1,354 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: int64 s0 + +# qhasm: int64 s1 + +# qhasm: enter update_asm +.p2align 5 +.global _PQCLEAN_MCELIECE348864F_AVX_update_asm +.global PQCLEAN_MCELIECE348864F_AVX_update_asm +_PQCLEAN_MCELIECE348864F_AVX_update_asm: +PQCLEAN_MCELIECE348864F_AVX_update_asm: +mov %rsp,%r11 +and $31,%r11 +add $0,%r11 +sub %r11,%rsp + +# qhasm: s1 = input_1 +# asm 1: mov s1=int64#2 +# asm 2: mov s1=%rsi +mov %rsi,%rsi + +# qhasm: s0 = mem64[ input_0 + 0 ] +# asm 1: movq 0(s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd 
$1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,> (j * 8)) & 0xFF; + } +} + +void PQCLEAN_MCELIECE348864F_AVX_store2(unsigned char *dest, gf a) { + dest[0] = a & 0xFF; + dest[1] = a >> 8; +} + +uint16_t PQCLEAN_MCELIECE348864F_AVX_load2(const unsigned char *src) { + uint16_t a; + + a = src[1]; + a <<= 8; + a |= src[0]; + + return a & GFMASK; +} + +uint32_t PQCLEAN_MCELIECE348864F_AVX_load4(const unsigned char *src) { + uint32_t a; + + a = src[3]; + a <<= 8; + a |= src[2]; + a <<= 8; + a |= src[1]; + a <<= 8; + a |= src[0]; + + return a; +} + +void PQCLEAN_MCELIECE348864F_AVX_irr_load(uint64_t *out, const unsigned char *in) { + int i, j; + uint16_t irr[ SYS_T + 1 ]; + + for (i = 0; i < SYS_T; i++) { + irr[i] = PQCLEAN_MCELIECE348864F_AVX_load2(in + i * 2); + irr[i] &= GFMASK; + } + + irr[ SYS_T ] = 1; + + for (i = 0; i < GFBITS; i++) { + out[i] = 0; + } + + for (i = SYS_T; i >= 0; i--) { + for (j = 0; j < GFBITS; j++) { + out[j] <<= 1; + out[j] |= (irr[i] >> j) & 1; + } + } +} + +void PQCLEAN_MCELIECE348864F_AVX_store8(unsigned char *out, uint64_t in) { + out[0] = (in >> 0x00) & 0xFF; + out[1] = (in >> 0x08) & 0xFF; + out[2] = (in >> 0x10) & 0xFF; + out[3] = (in >> 0x18) & 0xFF; + out[4] = (in >> 0x20) & 0xFF; + out[5] = (in >> 0x28) & 0xFF; + out[6] = (in >> 0x30) & 0xFF; + out[7] = (in >> 0x38) & 0xFF; +} + +uint64_t PQCLEAN_MCELIECE348864F_AVX_load8(const unsigned char *in) { + int i; + uint64_t ret = in[7]; + + for (i = 6; i >= 0; i--) { + ret <<= 8; + ret |= in[i]; + } + + return ret; +} + +gf PQCLEAN_MCELIECE348864F_AVX_bitrev(gf a) { + a = ((a & 0x00FF) << 8) | ((a & 0xFF00) >> 8); + a = ((a & 0x0F0F) << 4) | ((a & 0xF0F0) >> 4); + a = ((a & 0x3333) << 2) | ((a & 0xCCCC) >> 2); + a = ((a & 0x5555) << 1) | ((a & 0xAAAA) >> 1); + + return a >> 4; +} + +vec128 PQCLEAN_MCELIECE348864F_AVX_load16(const unsigned char *in) { + return PQCLEAN_MCELIECE348864F_AVX_vec128_set2x( PQCLEAN_MCELIECE348864F_AVX_load8(in), PQCLEAN_MCELIECE348864F_AVX_load8(in + 8) ); +} + +void PQCLEAN_MCELIECE348864F_AVX_store16(unsigned char *out, vec128 in) { + PQCLEAN_MCELIECE348864F_AVX_store8(out + 0, PQCLEAN_MCELIECE348864F_AVX_vec128_extract(in, 0)); + PQCLEAN_MCELIECE348864F_AVX_store8(out + 8, PQCLEAN_MCELIECE348864F_AVX_vec128_extract(in, 1)); +} diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/util.h b/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/util.h new file mode 100644 index 0000000000..e9a6258ee3 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/util.h @@ -0,0 +1,33 @@ +#ifndef PQCLEAN_MCELIECE348864F_AVX_UTIL_H +#define PQCLEAN_MCELIECE348864F_AVX_UTIL_H +/* + This file is for loading/storing data in a little-endian fashion +*/ + + +#include "gf.h" +#include "vec128.h" + +#include + +void PQCLEAN_MCELIECE348864F_AVX_store_i(unsigned char *out, uint64_t in, int i); +void PQCLEAN_MCELIECE348864F_AVX_store2(unsigned char *dest, gf a); + +uint16_t PQCLEAN_MCELIECE348864F_AVX_load2(const unsigned char *src); + +uint32_t PQCLEAN_MCELIECE348864F_AVX_load4(const unsigned char *src); + +void PQCLEAN_MCELIECE348864F_AVX_irr_load(uint64_t *out, const unsigned char *in); + +void PQCLEAN_MCELIECE348864F_AVX_store8(unsigned char *out, uint64_t in); + +uint64_t PQCLEAN_MCELIECE348864F_AVX_load8(const unsigned char *in); + +gf 
PQCLEAN_MCELIECE348864F_AVX_bitrev(gf a); + +vec128 PQCLEAN_MCELIECE348864F_AVX_load16(const unsigned char *in); + +void PQCLEAN_MCELIECE348864F_AVX_store16(unsigned char *out, vec128 in); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/vec.c b/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/vec.c new file mode 100644 index 0000000000..50e05ee04b --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/vec.c @@ -0,0 +1,25 @@ +#include "vec.h" + +#include "params.h" + +extern void PQCLEAN_MCELIECE348864F_AVX_vec_mul_asm(uint64_t *, const uint64_t *, const uint64_t *); +extern void PQCLEAN_MCELIECE348864F_AVX_vec_mul_sp_asm(uint64_t *, const uint64_t *, const uint64_t *); + + +void PQCLEAN_MCELIECE348864F_AVX_vec_mul(uint64_t *h, const uint64_t *f, const uint64_t *g) { + PQCLEAN_MCELIECE348864F_AVX_vec_mul_asm(h, f, g); +} + + +void PQCLEAN_MCELIECE348864F_AVX_vec_mul_sp(uint64_t *h, const uint64_t *f, const uint64_t *g) { + PQCLEAN_MCELIECE348864F_AVX_vec_mul_sp_asm(h, f, g); +} + +void PQCLEAN_MCELIECE348864F_AVX_vec_add(uint64_t *h, const uint64_t *f, const uint64_t *g) { + int b; + + for (b = 0; b < GFBITS; b++) { + h[b] = f[b] ^ g[b]; + } +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/vec.h b/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/vec.h new file mode 100644 index 0000000000..468f3e44e9 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/vec.h @@ -0,0 +1,13 @@ +#ifndef PQCLEAN_MCELIECE348864F_AVX_VEC_H +#define PQCLEAN_MCELIECE348864F_AVX_VEC_H + +#include + + +void PQCLEAN_MCELIECE348864F_AVX_vec_mul(uint64_t *h, const uint64_t *f, const uint64_t *g); + +void PQCLEAN_MCELIECE348864F_AVX_vec_mul_sp(uint64_t *h, const uint64_t *f, const uint64_t *g); + +void PQCLEAN_MCELIECE348864F_AVX_vec_add(uint64_t *h, const uint64_t *f, const uint64_t *g); + +#endif diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/vec128.c b/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/vec128.c new file mode 100644 index 0000000000..e92772f379 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/vec128.c @@ -0,0 +1,83 @@ +#include "vec128.h" + +vec128 PQCLEAN_MCELIECE348864F_AVX_vec128_set1_16b(uint16_t a) { + return _mm_set1_epi16(a); +} + +int PQCLEAN_MCELIECE348864F_AVX_vec128_testz(vec128 a) { + return _mm_testz_si128(a, a); +} + +vec128 PQCLEAN_MCELIECE348864F_AVX_vec128_setzero(void) { + return _mm_setzero_si128(); +} + +vec128 PQCLEAN_MCELIECE348864F_AVX_vec128_and(vec128 a, vec128 b) { + return _mm_and_si128(a, b); +} + +vec128 PQCLEAN_MCELIECE348864F_AVX_vec128_xor(vec128 a, vec128 b) { + return _mm_xor_si128(a, b); +} + +vec128 PQCLEAN_MCELIECE348864F_AVX_vec128_or(vec128 a, vec128 b) { + return _mm_or_si128(a, b); +} + +vec128 PQCLEAN_MCELIECE348864F_AVX_vec128_sll_2x(vec128 a, int s) { + return _mm_slli_epi64(a, s); +} + +vec128 PQCLEAN_MCELIECE348864F_AVX_vec128_srl_2x(vec128 a, int s) { + return _mm_srli_epi64(a, s); +} + +vec128 PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(uint64_t a0, uint64_t a1) { + return _mm_set_epi64x(a1, a0); +} + +vec128 PQCLEAN_MCELIECE348864F_AVX_vec128_unpack_low(vec128 a, vec128 b) { + return _mm_unpacklo_epi64(a, b); +} + +vec128 PQCLEAN_MCELIECE348864F_AVX_vec128_unpack_high(vec128 a, vec128 b) { + return _mm_unpackhi_epi64(a, b); +} + +vec128 PQCLEAN_MCELIECE348864F_AVX_vec128_setbits(uint64_t a) { + return _mm_set1_epi64x(-a); +} + +void PQCLEAN_MCELIECE348864F_AVX_vec128_copy(vec128 *dest, const vec128 *src) { + int i; + 
+ for (i = 0; i < GFBITS; i++) { + dest[i] = src[i]; + } +} + +void PQCLEAN_MCELIECE348864F_AVX_vec128_add(vec128 *c, const vec128 *a, const vec128 *b) { + int i; + + for (i = 0; i < GFBITS; i++) { + c[i] = PQCLEAN_MCELIECE348864F_AVX_vec128_xor(a[i], b[i]); + } +} + +vec128 PQCLEAN_MCELIECE348864F_AVX_vec128_or_reduce(const vec128 *a) { + int i; + vec128 ret; + + ret = a[0]; + for (i = 1; i < GFBITS; i++) { + ret = PQCLEAN_MCELIECE348864F_AVX_vec128_or(ret, a[i]); + } + + return ret; +} + +/* bitsliced field multiplications */ +void PQCLEAN_MCELIECE348864F_AVX_vec128_mul(vec128 *h, vec128 *f, const vec128 *g) { + PQCLEAN_MCELIECE348864F_AVX_vec128_mul_asm(h, f, g, 16); +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/vec128.h b/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/vec128.h new file mode 100644 index 0000000000..15c81fcd39 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/vec128.h @@ -0,0 +1,41 @@ +#ifndef PQCLEAN_MCELIECE348864F_AVX_VEC128_H +#define PQCLEAN_MCELIECE348864F_AVX_VEC128_H +/* + This file is for functions related to 128-bit vectors + including functions for bitsliced field operations +*/ + + +#include "params.h" + +#include +#include + +typedef __m128i vec128; + +// this needs to be a macro, because +// _mm_extract_epi64 requires a literal int argument. +#define PQCLEAN_MCELIECE348864F_AVX_vec128_extract(a, i) ((uint64_t) _mm_extract_epi64((vec128) (a), (i))) + +int PQCLEAN_MCELIECE348864F_AVX_vec128_testz(vec128 a); +vec128 PQCLEAN_MCELIECE348864F_AVX_vec128_set1_16b(uint16_t a); +vec128 PQCLEAN_MCELIECE348864F_AVX_vec128_setzero(void); +vec128 PQCLEAN_MCELIECE348864F_AVX_vec128_and(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE348864F_AVX_vec128_xor(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE348864F_AVX_vec128_or(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE348864F_AVX_vec128_sll_2x(vec128 a, int s); +vec128 PQCLEAN_MCELIECE348864F_AVX_vec128_srl_2x(vec128 a, int s); +vec128 PQCLEAN_MCELIECE348864F_AVX_vec128_set2x(uint64_t a0, uint64_t a1); +vec128 PQCLEAN_MCELIECE348864F_AVX_vec128_unpack_low(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE348864F_AVX_vec128_unpack_high(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE348864F_AVX_vec128_setbits(uint64_t a); +void PQCLEAN_MCELIECE348864F_AVX_vec128_copy(vec128 *dest, const vec128 *src); +void PQCLEAN_MCELIECE348864F_AVX_vec128_add(vec128 *c, const vec128 *a, const vec128 *b); +vec128 PQCLEAN_MCELIECE348864F_AVX_vec128_or_reduce(const vec128 *a); + +extern void PQCLEAN_MCELIECE348864F_AVX_vec128_mul_asm(vec128 *, vec128 *, const vec128 *, int); + +/* bitsliced field multiplications */ +void PQCLEAN_MCELIECE348864F_AVX_vec128_mul(vec128 *h, vec128 *f, const vec128 *g); + +#endif diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/vec128_mul_asm.S b/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/vec128_mul_asm.S new file mode 100644 index 0000000000..b8e332959d --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/vec128_mul_asm.S @@ -0,0 +1,1369 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: reg256 b0 + +# qhasm: reg256 b1 + +# qhasm: reg256 b2 + +# qhasm: 
reg256 b3 + +# qhasm: reg256 b4 + +# qhasm: reg256 b5 + +# qhasm: reg256 b6 + +# qhasm: reg256 b7 + +# qhasm: reg256 b8 + +# qhasm: reg256 b9 + +# qhasm: reg256 b10 + +# qhasm: reg256 b11 + +# qhasm: reg256 a0 + +# qhasm: reg256 a1 + +# qhasm: reg256 a2 + +# qhasm: reg256 a3 + +# qhasm: reg256 a4 + +# qhasm: reg256 a5 + +# qhasm: reg256 r0 + +# qhasm: reg256 r1 + +# qhasm: reg256 r2 + +# qhasm: reg256 r3 + +# qhasm: reg256 r4 + +# qhasm: reg256 r5 + +# qhasm: reg256 r6 + +# qhasm: reg256 r7 + +# qhasm: reg256 r8 + +# qhasm: reg256 r9 + +# qhasm: reg256 r10 + +# qhasm: reg256 r11 + +# qhasm: reg256 r12 + +# qhasm: reg256 r13 + +# qhasm: reg256 r14 + +# qhasm: reg256 r15 + +# qhasm: reg256 r16 + +# qhasm: reg256 r17 + +# qhasm: reg256 r18 + +# qhasm: reg256 r19 + +# qhasm: reg256 r20 + +# qhasm: reg256 r21 + +# qhasm: reg256 r22 + +# qhasm: reg256 r + +# qhasm: reg128 h0 + +# qhasm: reg128 h1 + +# qhasm: reg128 h2 + +# qhasm: reg128 h3 + +# qhasm: reg128 h4 + +# qhasm: reg128 h5 + +# qhasm: reg128 h6 + +# qhasm: reg128 h7 + +# qhasm: reg128 h8 + +# qhasm: reg128 h9 + +# qhasm: reg128 h10 + +# qhasm: reg128 h11 + +# qhasm: reg128 h12 + +# qhasm: reg128 h13 + +# qhasm: reg128 h14 + +# qhasm: reg128 h15 + +# qhasm: reg128 h16 + +# qhasm: reg128 h17 + +# qhasm: reg128 h18 + +# qhasm: reg128 h19 + +# qhasm: reg128 h20 + +# qhasm: reg128 h21 + +# qhasm: reg128 h22 + +# qhasm: stack4864 buf + +# qhasm: int64 ptr + +# qhasm: int64 tmp + +# qhasm: enter vec128_mul_asm +.p2align 5 +.global _PQCLEAN_MCELIECE348864F_AVX_vec128_mul_asm +.global PQCLEAN_MCELIECE348864F_AVX_vec128_mul_asm +_PQCLEAN_MCELIECE348864F_AVX_vec128_mul_asm: +PQCLEAN_MCELIECE348864F_AVX_vec128_mul_asm: +mov %rsp,%r11 +and $31,%r11 +add $608,%r11 +sub %r11,%rsp + +# qhasm: ptr = &buf +# asm 1: leaq ptr=int64#4 +# asm 2: leaq ptr=%rcx +leaq 0(%rsp),%rcx + +# qhasm: b11 = mem128[ input_2 + 176 ] x2 +# asm 1: vbroadcasti128 176(b11=reg256#1 +# asm 2: vbroadcasti128 176(b11=%ymm0 +vbroadcasti128 176(%rdx), %ymm0 + +# qhasm: a5[0] = mem128[ input_1 + 80 ] +# asm 1: vinsertf128 $0x0,80(r16=reg256#3 +# asm 2: vpand r16=%ymm2 +vpand %ymm0,%ymm1,%ymm2 + +# qhasm: mem256[ ptr + 512 ] = r16 +# asm 1: vmovupd r15=reg256#4 +# asm 2: vpand r15=%ymm3 +vpand %ymm0,%ymm2,%ymm3 + +# qhasm: a3[0] = mem128[ input_1 + 48 ] +# asm 1: vinsertf128 $0x0,48(r14=reg256#6 +# asm 2: vpand r14=%ymm5 +vpand %ymm0,%ymm4,%ymm5 + +# qhasm: a2[0] = mem128[ input_1 + 32 ] +# asm 1: vinsertf128 $0x0,32(r13=reg256#8 +# asm 2: vpand r13=%ymm7 +vpand %ymm0,%ymm6,%ymm7 + +# qhasm: a1[0] = mem128[ input_1 + 16 ] +# asm 1: vinsertf128 $0x0,16(r12=reg256#10 +# asm 2: vpand r12=%ymm9 +vpand %ymm0,%ymm8,%ymm9 + +# qhasm: a0[0] = mem128[ input_1 + 0 ] +# asm 1: vinsertf128 $0x0,0(r11=reg256#1 +# asm 2: vpand r11=%ymm0 +vpand %ymm0,%ymm10,%ymm0 + +# qhasm: b10 = mem128[ input_2 + 160 ] x2 +# asm 1: vbroadcasti128 160(b10=reg256#12 +# asm 2: vbroadcasti128 160(b10=%ymm11 +vbroadcasti128 160(%rdx), %ymm11 + +# qhasm: r = b10 & a5 +# asm 1: vpand r=reg256#13 +# asm 2: vpand r=%ymm12 +vpand %ymm11,%ymm1,%ymm12 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm11,%ymm2,%ymm3 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm11,%ymm4,%ymm3 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm11,%ymm6,%ymm3 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm11,%ymm8,%ymm3 + +# qhasm: r11 ^= r +# asm 1: vpxor r10=reg256#4 +# asm 2: vpand r10=%ymm3 +vpand 
%ymm11,%ymm10,%ymm3 + +# qhasm: b9 = mem128[ input_2 + 144 ] x2 +# asm 1: vbroadcasti128 144(b9=reg256#12 +# asm 2: vbroadcasti128 144(b9=%ymm11 +vbroadcasti128 144(%rdx), %ymm11 + +# qhasm: r = b9 & a5 +# asm 1: vpand r=reg256#13 +# asm 2: vpand r=%ymm12 +vpand %ymm11,%ymm1,%ymm12 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm11,%ymm2,%ymm5 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm11,%ymm4,%ymm5 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm11,%ymm6,%ymm5 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm11,%ymm8,%ymm5 + +# qhasm: r10 ^= r +# asm 1: vpxor r9=reg256#6 +# asm 2: vpand r9=%ymm5 +vpand %ymm11,%ymm10,%ymm5 + +# qhasm: b8 = mem128[ input_2 + 128 ] x2 +# asm 1: vbroadcasti128 128(b8=reg256#12 +# asm 2: vbroadcasti128 128(b8=%ymm11 +vbroadcasti128 128(%rdx), %ymm11 + +# qhasm: r = b8 & a5 +# asm 1: vpand r=reg256#13 +# asm 2: vpand r=%ymm12 +vpand %ymm11,%ymm1,%ymm12 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm11,%ymm2,%ymm7 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm11,%ymm4,%ymm7 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm11,%ymm6,%ymm7 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm11,%ymm8,%ymm7 + +# qhasm: r9 ^= r +# asm 1: vpxor r8=reg256#8 +# asm 2: vpand r8=%ymm7 +vpand %ymm11,%ymm10,%ymm7 + +# qhasm: b7 = mem128[ input_2 + 112 ] x2 +# asm 1: vbroadcasti128 112(b7=reg256#12 +# asm 2: vbroadcasti128 112(b7=%ymm11 +vbroadcasti128 112(%rdx), %ymm11 + +# qhasm: r = b7 & a5 +# asm 1: vpand r=reg256#13 +# asm 2: vpand r=%ymm12 +vpand %ymm11,%ymm1,%ymm12 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm11,%ymm2,%ymm9 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm11,%ymm4,%ymm9 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm11,%ymm6,%ymm9 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm11,%ymm8,%ymm9 + +# qhasm: r8 ^= r +# asm 1: vpxor r7=reg256#10 +# asm 2: vpand r7=%ymm9 +vpand %ymm11,%ymm10,%ymm9 + +# qhasm: b6 = mem128[ input_2 + 96 ] x2 +# asm 1: vbroadcasti128 96(b6=reg256#12 +# asm 2: vbroadcasti128 96(b6=%ymm11 +vbroadcasti128 96(%rdx), %ymm11 + +# qhasm: r = b6 & a5 +# asm 1: vpand r=reg256#13 +# asm 2: vpand r=%ymm12 +vpand %ymm11,%ymm1,%ymm12 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm11,%ymm2,%ymm0 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm11,%ymm4,%ymm0 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm11,%ymm6,%ymm0 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm11,%ymm8,%ymm0 + +# qhasm: r7 ^= r +# asm 1: vpxor r6=reg256#1 +# asm 2: vpand r6=%ymm0 +vpand %ymm11,%ymm10,%ymm0 + +# qhasm: b5 = mem128[ input_2 + 80 ] x2 +# asm 1: vbroadcasti128 80(b5=reg256#12 +# asm 2: vbroadcasti128 80(b5=%ymm11 +vbroadcasti128 80(%rdx), %ymm11 + +# qhasm: r = b5 & a5 +# asm 1: vpand r=reg256#13 +# asm 2: vpand r=%ymm12 +vpand %ymm11,%ymm1,%ymm12 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm11,%ymm2,%ymm3 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm11,%ymm4,%ymm3 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: 
vpand r=%ymm3 +vpand %ymm11,%ymm6,%ymm3 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm11,%ymm8,%ymm3 + +# qhasm: r6 ^= r +# asm 1: vpxor r5=reg256#4 +# asm 2: vpand r5=%ymm3 +vpand %ymm11,%ymm10,%ymm3 + +# qhasm: b4 = mem128[ input_2 + 64 ] x2 +# asm 1: vbroadcasti128 64(b4=reg256#12 +# asm 2: vbroadcasti128 64(b4=%ymm11 +vbroadcasti128 64(%rdx), %ymm11 + +# qhasm: r = b4 & a5 +# asm 1: vpand r=reg256#13 +# asm 2: vpand r=%ymm12 +vpand %ymm11,%ymm1,%ymm12 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm11,%ymm2,%ymm5 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm11,%ymm4,%ymm5 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm11,%ymm6,%ymm5 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm11,%ymm8,%ymm5 + +# qhasm: r5 ^= r +# asm 1: vpxor r4=reg256#6 +# asm 2: vpand r4=%ymm5 +vpand %ymm11,%ymm10,%ymm5 + +# qhasm: b3 = mem128[ input_2 + 48 ] x2 +# asm 1: vbroadcasti128 48(b3=reg256#12 +# asm 2: vbroadcasti128 48(b3=%ymm11 +vbroadcasti128 48(%rdx), %ymm11 + +# qhasm: r = b3 & a5 +# asm 1: vpand r=reg256#13 +# asm 2: vpand r=%ymm12 +vpand %ymm11,%ymm1,%ymm12 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm11,%ymm2,%ymm7 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm11,%ymm4,%ymm7 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm11,%ymm6,%ymm7 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm11,%ymm8,%ymm7 + +# qhasm: r4 ^= r +# asm 1: vpxor r3=reg256#8 +# asm 2: vpand r3=%ymm7 +vpand %ymm11,%ymm10,%ymm7 + +# qhasm: b2 = mem128[ input_2 + 32 ] x2 +# asm 1: vbroadcasti128 32(b2=reg256#12 +# asm 2: vbroadcasti128 32(b2=%ymm11 +vbroadcasti128 32(%rdx), %ymm11 + +# qhasm: r = b2 & a5 +# asm 1: vpand r=reg256#13 +# asm 2: vpand r=%ymm12 +vpand %ymm11,%ymm1,%ymm12 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm11,%ymm2,%ymm9 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm11,%ymm4,%ymm9 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm11,%ymm6,%ymm9 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm11,%ymm8,%ymm9 + +# qhasm: r3 ^= r +# asm 1: vpxor r2=reg256#10 +# asm 2: vpand r2=%ymm9 +vpand %ymm11,%ymm10,%ymm9 + +# qhasm: b1 = mem128[ input_2 + 16 ] x2 +# asm 1: vbroadcasti128 16(b1=reg256#12 +# asm 2: vbroadcasti128 16(b1=%ymm11 +vbroadcasti128 16(%rdx), %ymm11 + +# qhasm: r = b1 & a5 +# asm 1: vpand r=reg256#13 +# asm 2: vpand r=%ymm12 +vpand %ymm11,%ymm1,%ymm12 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm11,%ymm2,%ymm0 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm11,%ymm4,%ymm0 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm11,%ymm6,%ymm0 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm11,%ymm8,%ymm0 + +# qhasm: r2 ^= r +# asm 1: vpxor r1=reg256#1 +# asm 2: vpand r1=%ymm0 +vpand %ymm11,%ymm10,%ymm0 + +# qhasm: b0 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b0=reg256#12 +# asm 2: vbroadcasti128 0(b0=%ymm11 +vbroadcasti128 0(%rdx), %ymm11 + +# qhasm: r = b0 & a5 +# asm 1: vpand r=reg256#2 +# asm 2: vpand r=%ymm1 +vpand %ymm11,%ymm1,%ymm1 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#2 +# asm 2: vpand r=%ymm1 
+vpand %ymm11,%ymm2,%ymm1 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#2 +# asm 2: vpand r=%ymm1 +vpand %ymm11,%ymm4,%ymm1 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#2 +# asm 2: vpand r=%ymm1 +vpand %ymm11,%ymm6,%ymm1 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#2 +# asm 2: vpand r=%ymm1 +vpand %ymm11,%ymm8,%ymm1 + +# qhasm: r1 ^= r +# asm 1: vpxor r0=reg256#2 +# asm 2: vpand r0=%ymm1 +vpand %ymm11,%ymm10,%ymm1 + +# qhasm: mem256[ ptr + 128 ] = r4 +# asm 1: vmovupd h22=reg128#1 +# asm 2: movdqu 528(h22=%xmm0 +movdqu 528(%rcx),%xmm0 + +# qhasm: h13 = h22 +# asm 1: movdqa h13=reg128#2 +# asm 2: movdqa h13=%xmm1 +movdqa %xmm0,%xmm1 + +# qhasm: h10 = h22 +# asm 1: movdqa h10=reg128#1 +# asm 2: movdqa h10=%xmm0 +movdqa %xmm0,%xmm0 + +# qhasm: h21 = mem128[ ptr + 496 ] +# asm 1: movdqu 496(h21=reg128#3 +# asm 2: movdqu 496(h21=%xmm2 +movdqu 496(%rcx),%xmm2 + +# qhasm: h12 = h21 +# asm 1: movdqa h12=reg128#4 +# asm 2: movdqa h12=%xmm3 +movdqa %xmm2,%xmm3 + +# qhasm: h9 = h21 +# asm 1: movdqa h9=reg128#3 +# asm 2: movdqa h9=%xmm2 +movdqa %xmm2,%xmm2 + +# qhasm: h20 = mem128[ ptr + 464 ] +# asm 1: movdqu 464(h20=reg128#5 +# asm 2: movdqu 464(h20=%xmm4 +movdqu 464(%rcx),%xmm4 + +# qhasm: h11 = h20 +# asm 1: movdqa h11=reg128#6 +# asm 2: movdqa h11=%xmm5 +movdqa %xmm4,%xmm5 + +# qhasm: h8 = h20 +# asm 1: movdqa h8=reg128#5 +# asm 2: movdqa h8=%xmm4 +movdqa %xmm4,%xmm4 + +# qhasm: h19 = mem128[ ptr + 432 ] +# asm 1: movdqu 432(h19=reg128#7 +# asm 2: movdqu 432(h19=%xmm6 +movdqu 432(%rcx),%xmm6 + +# qhasm: h10 = h10 ^ h19 +# asm 1: vpxor h10=reg128#1 +# asm 2: vpxor h10=%xmm0 +vpxor %xmm6,%xmm0,%xmm0 + +# qhasm: h7 = h19 +# asm 1: movdqa h7=reg128#7 +# asm 2: movdqa h7=%xmm6 +movdqa %xmm6,%xmm6 + +# qhasm: h18 = mem128[ ptr + 400 ] +# asm 1: movdqu 400(h18=reg128#8 +# asm 2: movdqu 400(h18=%xmm7 +movdqu 400(%rcx),%xmm7 + +# qhasm: h9 = h9 ^ h18 +# asm 1: vpxor h9=reg128#3 +# asm 2: vpxor h9=%xmm2 +vpxor %xmm7,%xmm2,%xmm2 + +# qhasm: h6 = h18 +# asm 1: movdqa h6=reg128#8 +# asm 2: movdqa h6=%xmm7 +movdqa %xmm7,%xmm7 + +# qhasm: h17 = mem128[ ptr + 368 ] +# asm 1: movdqu 368(h17=reg128#9 +# asm 2: movdqu 368(h17=%xmm8 +movdqu 368(%rcx),%xmm8 + +# qhasm: h8 = h8 ^ h17 +# asm 1: vpxor h8=reg128#5 +# asm 2: vpxor h8=%xmm4 +vpxor %xmm8,%xmm4,%xmm4 + +# qhasm: h5 = h17 +# asm 1: movdqa h5=reg128#9 +# asm 2: movdqa h5=%xmm8 +movdqa %xmm8,%xmm8 + +# qhasm: h16 = mem128[ ptr + 336 ] +# asm 1: movdqu 336(h16=reg128#10 +# asm 2: movdqu 336(h16=%xmm9 +movdqu 336(%rcx),%xmm9 + +# qhasm: h16 = h16 ^ mem128[ ptr + 512 ] +# asm 1: vpxor 512(h16=reg128#10 +# asm 2: vpxor 512(h16=%xmm9 +vpxor 512(%rcx),%xmm9,%xmm9 + +# qhasm: h7 = h7 ^ h16 +# asm 1: vpxor h7=reg128#7 +# asm 2: vpxor h7=%xmm6 +vpxor %xmm9,%xmm6,%xmm6 + +# qhasm: h4 = h16 +# asm 1: movdqa h4=reg128#10 +# asm 2: movdqa h4=%xmm9 +movdqa %xmm9,%xmm9 + +# qhasm: h15 = mem128[ ptr + 304 ] +# asm 1: movdqu 304(h15=reg128#11 +# asm 2: movdqu 304(h15=%xmm10 +movdqu 304(%rcx),%xmm10 + +# qhasm: h15 = h15 ^ mem128[ ptr + 480 ] +# asm 1: vpxor 480(h15=reg128#11 +# asm 2: vpxor 480(h15=%xmm10 +vpxor 480(%rcx),%xmm10,%xmm10 + +# qhasm: h6 = h6 ^ h15 +# asm 1: vpxor h6=reg128#8 +# asm 2: vpxor h6=%xmm7 +vpxor %xmm10,%xmm7,%xmm7 + +# qhasm: h3 = h15 +# asm 1: movdqa h3=reg128#11 +# asm 2: movdqa h3=%xmm10 +movdqa %xmm10,%xmm10 + +# qhasm: h14 = mem128[ ptr + 272 ] +# asm 1: movdqu 272(h14=reg128#12 +# asm 2: movdqu 272(h14=%xmm11 +movdqu 272(%rcx),%xmm11 + +# qhasm: h14 = h14 ^ mem128[ ptr + 448 ] +# asm 1: vpxor 448(h14=reg128#12 +# asm 2: vpxor 448(h14=%xmm11 +vpxor 
448(%rcx),%xmm11,%xmm11 + +# qhasm: h5 = h5 ^ h14 +# asm 1: vpxor h5=reg128#9 +# asm 2: vpxor h5=%xmm8 +vpxor %xmm11,%xmm8,%xmm8 + +# qhasm: h2 = h14 +# asm 1: movdqa h2=reg128#12 +# asm 2: movdqa h2=%xmm11 +movdqa %xmm11,%xmm11 + +# qhasm: h13 = h13 ^ mem128[ ptr + 240 ] +# asm 1: vpxor 240(h13=reg128#2 +# asm 2: vpxor 240(h13=%xmm1 +vpxor 240(%rcx),%xmm1,%xmm1 + +# qhasm: h13 = h13 ^ mem128[ ptr + 416 ] +# asm 1: vpxor 416(h13=reg128#2 +# asm 2: vpxor 416(h13=%xmm1 +vpxor 416(%rcx),%xmm1,%xmm1 + +# qhasm: h4 = h4 ^ h13 +# asm 1: vpxor h4=reg128#10 +# asm 2: vpxor h4=%xmm9 +vpxor %xmm1,%xmm9,%xmm9 + +# qhasm: h1 = h13 +# asm 1: movdqa h1=reg128#2 +# asm 2: movdqa h1=%xmm1 +movdqa %xmm1,%xmm1 + +# qhasm: h12 = h12 ^ mem128[ ptr + 208 ] +# asm 1: vpxor 208(h12=reg128#4 +# asm 2: vpxor 208(h12=%xmm3 +vpxor 208(%rcx),%xmm3,%xmm3 + +# qhasm: h12 = h12 ^ mem128[ ptr + 384 ] +# asm 1: vpxor 384(h12=reg128#4 +# asm 2: vpxor 384(h12=%xmm3 +vpxor 384(%rcx),%xmm3,%xmm3 + +# qhasm: h3 = h3 ^ h12 +# asm 1: vpxor h3=reg128#11 +# asm 2: vpxor h3=%xmm10 +vpxor %xmm3,%xmm10,%xmm10 + +# qhasm: h0 = h12 +# asm 1: movdqa h0=reg128#4 +# asm 2: movdqa h0=%xmm3 +movdqa %xmm3,%xmm3 + +# qhasm: h11 = h11 ^ mem128[ ptr + 352 ] +# asm 1: vpxor 352(h11=reg128#6 +# asm 2: vpxor 352(h11=%xmm5 +vpxor 352(%rcx),%xmm5,%xmm5 + +# qhasm: h11 = h11 ^ mem128[ ptr + 176 ] +# asm 1: vpxor 176(h11=reg128#6 +# asm 2: vpxor 176(h11=%xmm5 +vpxor 176(%rcx),%xmm5,%xmm5 + +# qhasm: mem128[ input_0 + 176 ] = h11 +# asm 1: movdqu h10=reg128#1 +# asm 2: vpxor 320(h10=%xmm0 +vpxor 320(%rcx),%xmm0,%xmm0 + +# qhasm: h10 = h10 ^ mem128[ ptr + 144 ] +# asm 1: vpxor 144(h10=reg128#1 +# asm 2: vpxor 144(h10=%xmm0 +vpxor 144(%rcx),%xmm0,%xmm0 + +# qhasm: mem128[ input_0 + 160 ] = h10 +# asm 1: movdqu h9=reg128#1 +# asm 2: vpxor 288(h9=%xmm0 +vpxor 288(%rcx),%xmm2,%xmm0 + +# qhasm: h9 = h9 ^ mem128[ ptr + 112 ] +# asm 1: vpxor 112(h9=reg128#1 +# asm 2: vpxor 112(h9=%xmm0 +vpxor 112(%rcx),%xmm0,%xmm0 + +# qhasm: mem128[ input_0 + 144 ] = h9 +# asm 1: movdqu h8=reg128#1 +# asm 2: vpxor 256(h8=%xmm0 +vpxor 256(%rcx),%xmm4,%xmm0 + +# qhasm: h8 = h8 ^ mem128[ ptr + 80 ] +# asm 1: vpxor 80(h8=reg128#1 +# asm 2: vpxor 80(h8=%xmm0 +vpxor 80(%rcx),%xmm0,%xmm0 + +# qhasm: mem128[ input_0 + 128 ] = h8 +# asm 1: movdqu h7=reg128#1 +# asm 2: vpxor 224(h7=%xmm0 +vpxor 224(%rcx),%xmm6,%xmm0 + +# qhasm: h7 = h7 ^ mem128[ ptr + 48 ] +# asm 1: vpxor 48(h7=reg128#1 +# asm 2: vpxor 48(h7=%xmm0 +vpxor 48(%rcx),%xmm0,%xmm0 + +# qhasm: mem128[ input_0 + 112 ] = h7 +# asm 1: movdqu h6=reg128#1 +# asm 2: vpxor 192(h6=%xmm0 +vpxor 192(%rcx),%xmm7,%xmm0 + +# qhasm: h6 = h6 ^ mem128[ ptr + 16 ] +# asm 1: vpxor 16(h6=reg128#1 +# asm 2: vpxor 16(h6=%xmm0 +vpxor 16(%rcx),%xmm0,%xmm0 + +# qhasm: mem128[ input_0 + 96 ] = h6 +# asm 1: movdqu h5=reg128#1 +# asm 2: vpxor 160(h5=%xmm0 +vpxor 160(%rcx),%xmm8,%xmm0 + +# qhasm: mem128[ input_0 + 80 ] = h5 +# asm 1: movdqu h4=reg128#1 +# asm 2: vpxor 128(h4=%xmm0 +vpxor 128(%rcx),%xmm9,%xmm0 + +# qhasm: mem128[ input_0 + 64 ] = h4 +# asm 1: movdqu h3=reg128#1 +# asm 2: vpxor 96(h3=%xmm0 +vpxor 96(%rcx),%xmm10,%xmm0 + +# qhasm: mem128[ input_0 + 48 ] = h3 +# asm 1: movdqu h2=reg128#1 +# asm 2: vpxor 64(h2=%xmm0 +vpxor 64(%rcx),%xmm11,%xmm0 + +# qhasm: mem128[ input_0 + 32 ] = h2 +# asm 1: movdqu h1=reg128#1 +# asm 2: vpxor 32(h1=%xmm0 +vpxor 32(%rcx),%xmm1,%xmm0 + +# qhasm: mem128[ input_0 + 16 ] = h1 +# asm 1: movdqu h0=reg128#1 +# asm 2: vpxor 0(h0=%xmm0 +vpxor 0(%rcx),%xmm3,%xmm0 + +# qhasm: mem128[ input_0 + 0 ] = h0 +# asm 1: 
movdqu + +typedef __m256i vec256; + +vec256 PQCLEAN_MCELIECE348864F_AVX_vec256_set1_16b(uint16_t a); +vec256 PQCLEAN_MCELIECE348864F_AVX_vec256_setzero(void); +vec256 PQCLEAN_MCELIECE348864F_AVX_vec256_set4x(uint64_t a0, uint64_t a1, uint64_t a2, uint64_t a3); + +// Extract requires a literal argument so need to be macros +#define PQCLEAN_MCELIECE348864F_AVX_vec256_extract2x(a,i) ((vec128) _mm256_extractf128_si256((vec256) (a),(i))) +#define PQCLEAN_MCELIECE348864F_AVX_vec256_extract(a,i) ((uint64_t) _mm256_extract_epi64((vec256) (a),(i))) + +int PQCLEAN_MCELIECE348864F_AVX_vec256_testz(vec256 a); +vec256 PQCLEAN_MCELIECE348864F_AVX_vec256_and(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE348864F_AVX_vec256_xor(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE348864F_AVX_vec256_or(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE348864F_AVX_vec256_sll_4x(vec256 a, int s); +vec256 PQCLEAN_MCELIECE348864F_AVX_vec256_srl_4x(vec256 a, int s); +vec256 PQCLEAN_MCELIECE348864F_AVX_vec256_unpack_low(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE348864F_AVX_vec256_unpack_high(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE348864F_AVX_vec256_unpack_low_2x(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE348864F_AVX_vec256_unpack_high_2x(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE348864F_AVX_vec256_or_reduce(const vec256 *a); +void PQCLEAN_MCELIECE348864F_AVX_vec256_copy(vec256 *dest, const vec256 *src); + +/* bitsliced field multiplications */ +void PQCLEAN_MCELIECE348864F_AVX_vec256_mul(vec256 *h, vec256 *f, const vec256 *g); +void PQCLEAN_MCELIECE348864F_AVX_vec256_sq(vec256 * /*out*/, const vec256 * /*in*/); +void PQCLEAN_MCELIECE348864F_AVX_vec256_inv(vec256 * /*out*/, const vec256 * /*in*/); + +extern void PQCLEAN_MCELIECE348864F_AVX_vec256_maa_asm(vec256 *, vec256 *, const vec256 *); +extern void PQCLEAN_MCELIECE348864F_AVX_vec256_ama_asm(vec256 *, vec256 *, const vec256 *); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/vec256_mul_asm.S b/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/vec256_mul_asm.S new file mode 100644 index 0000000000..e12810b2b2 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/vec256_mul_asm.S @@ -0,0 +1,1736 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: reg256 a0 + +# qhasm: reg256 a1 + +# qhasm: reg256 a2 + +# qhasm: reg256 a3 + +# qhasm: reg256 a4 + +# qhasm: reg256 a5 + +# qhasm: reg256 a6 + +# qhasm: reg256 a7 + +# qhasm: reg256 a8 + +# qhasm: reg256 a9 + +# qhasm: reg256 a10 + +# qhasm: reg256 a11 + +# qhasm: reg256 b0 + +# qhasm: reg256 b1 + +# qhasm: reg256 r0 + +# qhasm: reg256 r1 + +# qhasm: reg256 r2 + +# qhasm: reg256 r3 + +# qhasm: reg256 r4 + +# qhasm: reg256 r5 + +# qhasm: reg256 r6 + +# qhasm: reg256 r7 + +# qhasm: reg256 r8 + +# qhasm: reg256 r9 + +# qhasm: reg256 r10 + +# qhasm: reg256 r11 + +# qhasm: reg256 r12 + +# qhasm: reg256 r13 + +# qhasm: reg256 r14 + +# qhasm: reg256 r15 + +# qhasm: reg256 r16 + +# qhasm: reg256 r17 + +# qhasm: reg256 r18 + +# qhasm: reg256 r19 + +# qhasm: reg256 r20 + +# qhasm: reg256 r21 + +# qhasm: reg256 r22 + +# qhasm: reg256 r + +# qhasm: enter vec256_mul_asm +.p2align 5 +.global 
_PQCLEAN_MCELIECE348864F_AVX_vec256_mul_asm +.global PQCLEAN_MCELIECE348864F_AVX_vec256_mul_asm +_PQCLEAN_MCELIECE348864F_AVX_vec256_mul_asm: +PQCLEAN_MCELIECE348864F_AVX_vec256_mul_asm: +mov %rsp,%r11 +and $31,%r11 +add $0,%r11 +sub %r11,%rsp + +# qhasm: b0 = mem256[ input_2 + 0 ] +# asm 1: vmovupd 0(b0=reg256#1 +# asm 2: vmovupd 0(b0=%ymm0 +vmovupd 0(%rdx),%ymm0 + +# qhasm: a11 = mem256[ input_1 + 352 ] +# asm 1: vmovupd 352(a11=reg256#2 +# asm 2: vmovupd 352(a11=%ymm1 +vmovupd 352(%rsi),%ymm1 + +# qhasm: r11 = a11 & b0 +# asm 1: vpand r11=reg256#3 +# asm 2: vpand r11=%ymm2 +vpand %ymm1,%ymm0,%ymm2 + +# qhasm: r12 = a11 & mem256[input_2 + 32] +# asm 1: vpand 32(r12=reg256#4 +# asm 2: vpand 32(r12=%ymm3 +vpand 32(%rdx),%ymm1,%ymm3 + +# qhasm: r13 = a11 & mem256[input_2 + 64] +# asm 1: vpand 64(r13=reg256#5 +# asm 2: vpand 64(r13=%ymm4 +vpand 64(%rdx),%ymm1,%ymm4 + +# qhasm: r14 = a11 & mem256[input_2 + 96] +# asm 1: vpand 96(r14=reg256#6 +# asm 2: vpand 96(r14=%ymm5 +vpand 96(%rdx),%ymm1,%ymm5 + +# qhasm: r15 = a11 & mem256[input_2 + 128] +# asm 1: vpand 128(r15=reg256#7 +# asm 2: vpand 128(r15=%ymm6 +vpand 128(%rdx),%ymm1,%ymm6 + +# qhasm: r16 = a11 & mem256[input_2 + 160] +# asm 1: vpand 160(r16=reg256#8 +# asm 2: vpand 160(r16=%ymm7 +vpand 160(%rdx),%ymm1,%ymm7 + +# qhasm: r17 = a11 & mem256[input_2 + 192] +# asm 1: vpand 192(r17=reg256#9 +# asm 2: vpand 192(r17=%ymm8 +vpand 192(%rdx),%ymm1,%ymm8 + +# qhasm: r18 = a11 & mem256[input_2 + 224] +# asm 1: vpand 224(r18=reg256#10 +# asm 2: vpand 224(r18=%ymm9 +vpand 224(%rdx),%ymm1,%ymm9 + +# qhasm: r19 = a11 & mem256[input_2 + 256] +# asm 1: vpand 256(r19=reg256#11 +# asm 2: vpand 256(r19=%ymm10 +vpand 256(%rdx),%ymm1,%ymm10 + +# qhasm: r20 = a11 & mem256[input_2 + 288] +# asm 1: vpand 288(r20=reg256#12 +# asm 2: vpand 288(r20=%ymm11 +vpand 288(%rdx),%ymm1,%ymm11 + +# qhasm: r21 = a11 & mem256[input_2 + 320] +# asm 1: vpand 320(r21=reg256#13 +# asm 2: vpand 320(r21=%ymm12 +vpand 320(%rdx),%ymm1,%ymm12 + +# qhasm: r22 = a11 & mem256[input_2 + 352] +# asm 1: vpand 352(r22=reg256#2 +# asm 2: vpand 352(r22=%ymm1 +vpand 352(%rdx),%ymm1,%ymm1 + +# qhasm: r13 ^= r22 +# asm 1: vpxor r10=reg256#2 +# asm 2: vmovapd r10=%ymm1 +vmovapd %ymm1,%ymm1 + +# qhasm: a10 = mem256[ input_1 + 320 ] +# asm 1: vmovupd 320(a10=reg256#14 +# asm 2: vmovupd 320(a10=%ymm13 +vmovupd 320(%rsi),%ymm13 + +# qhasm: r = a10 & b0 +# asm 1: vpand r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm0,%ymm14 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 32(r=%ymm14 +vpand 32(%rdx),%ymm13,%ymm14 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 64(r=%ymm14 +vpand 64(%rdx),%ymm13,%ymm14 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 96(r=%ymm14 +vpand 96(%rdx),%ymm13,%ymm14 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 128(r=%ymm14 +vpand 128(%rdx),%ymm13,%ymm14 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 160(r=%ymm14 +vpand 160(%rdx),%ymm13,%ymm14 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 192(r=%ymm14 +vpand 192(%rdx),%ymm13,%ymm14 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 224(r=%ymm14 +vpand 224(%rdx),%ymm13,%ymm14 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 256(r=%ymm14 +vpand 256(%rdx),%ymm13,%ymm14 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 288(r=%ymm14 +vpand 288(%rdx),%ymm13,%ymm14 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 320(r=%ymm14 +vpand 320(%rdx),%ymm13,%ymm14 + 
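+
+# Note (descriptive comment, not qhasm generator output): the surrounding
+# generated code appears to implement a bitsliced carry-less multiplication.
+# Each of the twelve 256-bit words a0..a11 of the first operand is ANDed
+# against the twelve words of the second operand (partial products) and XORed
+# into the running sums r0..r22; the high sums r12..r22 are folded back into
+# the low ones as they complete (e.g. r22 into r13 and r10), a pattern
+# consistent with reduction modulo x^12 + x^3 + 1 for the GF(2^12) field used
+# by the 348864 parameter set.
+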
+# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#14 +# asm 2: vpand 352(r=%ymm13 +vpand 352(%rdx),%ymm13,%ymm13 + +# qhasm: r21 ^= r +# asm 1: vpxor r9=reg256#13 +# asm 2: vmovapd r9=%ymm12 +vmovapd %ymm12,%ymm12 + +# qhasm: a9 = mem256[ input_1 + 288 ] +# asm 1: vmovupd 288(a9=reg256#14 +# asm 2: vmovupd 288(a9=%ymm13 +vmovupd 288(%rsi),%ymm13 + +# qhasm: r = a9 & b0 +# asm 1: vpand r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm0,%ymm14 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 32(r=%ymm14 +vpand 32(%rdx),%ymm13,%ymm14 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 64(r=%ymm14 +vpand 64(%rdx),%ymm13,%ymm14 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 96(r=%ymm14 +vpand 96(%rdx),%ymm13,%ymm14 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 128(r=%ymm14 +vpand 128(%rdx),%ymm13,%ymm14 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 160(r=%ymm14 +vpand 160(%rdx),%ymm13,%ymm14 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 192(r=%ymm14 +vpand 192(%rdx),%ymm13,%ymm14 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 224(r=%ymm14 +vpand 224(%rdx),%ymm13,%ymm14 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 256(r=%ymm14 +vpand 256(%rdx),%ymm13,%ymm14 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 288(r=%ymm14 +vpand 288(%rdx),%ymm13,%ymm14 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 320(r=%ymm14 +vpand 320(%rdx),%ymm13,%ymm14 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#14 +# asm 2: vpand 352(r=%ymm13 +vpand 352(%rdx),%ymm13,%ymm13 + +# qhasm: r20 ^= r +# asm 1: vpxor r8=reg256#12 +# asm 2: vmovapd r8=%ymm11 +vmovapd %ymm11,%ymm11 + +# qhasm: a8 = mem256[ input_1 + 256 ] +# asm 1: vmovupd 256(a8=reg256#14 +# asm 2: vmovupd 256(a8=%ymm13 +vmovupd 256(%rsi),%ymm13 + +# qhasm: r = a8 & b0 +# asm 1: vpand r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm0,%ymm14 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 32(r=%ymm14 +vpand 32(%rdx),%ymm13,%ymm14 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 64(r=%ymm14 +vpand 64(%rdx),%ymm13,%ymm14 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 96(r=%ymm14 +vpand 96(%rdx),%ymm13,%ymm14 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 128(r=%ymm14 +vpand 128(%rdx),%ymm13,%ymm14 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 160(r=%ymm14 +vpand 160(%rdx),%ymm13,%ymm14 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 192(r=%ymm14 +vpand 192(%rdx),%ymm13,%ymm14 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 224(r=%ymm14 +vpand 224(%rdx),%ymm13,%ymm14 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 256(r=%ymm14 +vpand 256(%rdx),%ymm13,%ymm14 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 288(r=%ymm14 +vpand 288(%rdx),%ymm13,%ymm14 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 320(r=%ymm14 +vpand 320(%rdx),%ymm13,%ymm14 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#14 +# asm 2: vpand 352(r=%ymm13 +vpand 352(%rdx),%ymm13,%ymm13 + +# qhasm: r19 ^= r +# asm 1: vpxor r7=reg256#11 +# asm 2: vmovapd r7=%ymm10 +vmovapd %ymm10,%ymm10 + +# qhasm: a7 = mem256[ input_1 + 224 ] +# asm 1: vmovupd 224(a7=reg256#14 +# asm 2: vmovupd 224(a7=%ymm13 +vmovupd 224(%rsi),%ymm13 + +# qhasm: r = a7 & b0 +# asm 1: vpand r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm0,%ymm14 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#15 
+# asm 2: vpand 32(r=%ymm14 +vpand 32(%rdx),%ymm13,%ymm14 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 64(r=%ymm14 +vpand 64(%rdx),%ymm13,%ymm14 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 96(r=%ymm14 +vpand 96(%rdx),%ymm13,%ymm14 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 128(r=%ymm14 +vpand 128(%rdx),%ymm13,%ymm14 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 160(r=%ymm14 +vpand 160(%rdx),%ymm13,%ymm14 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 192(r=%ymm14 +vpand 192(%rdx),%ymm13,%ymm14 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 224(r=%ymm14 +vpand 224(%rdx),%ymm13,%ymm14 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 256(r=%ymm14 +vpand 256(%rdx),%ymm13,%ymm14 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 288(r=%ymm14 +vpand 288(%rdx),%ymm13,%ymm14 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 320(r=%ymm14 +vpand 320(%rdx),%ymm13,%ymm14 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#14 +# asm 2: vpand 352(r=%ymm13 +vpand 352(%rdx),%ymm13,%ymm13 + +# qhasm: r18 ^= r +# asm 1: vpxor r6=reg256#10 +# asm 2: vmovapd r6=%ymm9 +vmovapd %ymm9,%ymm9 + +# qhasm: a6 = mem256[ input_1 + 192 ] +# asm 1: vmovupd 192(a6=reg256#14 +# asm 2: vmovupd 192(a6=%ymm13 +vmovupd 192(%rsi),%ymm13 + +# qhasm: r = a6 & b0 +# asm 1: vpand r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm0,%ymm14 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 32(r=%ymm14 +vpand 32(%rdx),%ymm13,%ymm14 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 64(r=%ymm14 +vpand 64(%rdx),%ymm13,%ymm14 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 96(r=%ymm14 +vpand 96(%rdx),%ymm13,%ymm14 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 128(r=%ymm14 +vpand 128(%rdx),%ymm13,%ymm14 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 160(r=%ymm14 +vpand 160(%rdx),%ymm13,%ymm14 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 192(r=%ymm14 +vpand 192(%rdx),%ymm13,%ymm14 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 224(r=%ymm14 +vpand 224(%rdx),%ymm13,%ymm14 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 256(r=%ymm14 +vpand 256(%rdx),%ymm13,%ymm14 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 288(r=%ymm14 +vpand 288(%rdx),%ymm13,%ymm14 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 320(r=%ymm14 +vpand 320(%rdx),%ymm13,%ymm14 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#14 +# asm 2: vpand 352(r=%ymm13 +vpand 352(%rdx),%ymm13,%ymm13 + +# qhasm: r17 ^= r +# asm 1: vpxor r5=reg256#9 +# asm 2: vmovapd r5=%ymm8 +vmovapd %ymm8,%ymm8 + +# qhasm: a5 = mem256[ input_1 + 160 ] +# asm 1: vmovupd 160(a5=reg256#14 +# asm 2: vmovupd 160(a5=%ymm13 +vmovupd 160(%rsi),%ymm13 + +# qhasm: r = a5 & b0 +# asm 1: vpand r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm0,%ymm14 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 32(r=%ymm14 +vpand 32(%rdx),%ymm13,%ymm14 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 64(r=%ymm14 +vpand 64(%rdx),%ymm13,%ymm14 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 96(r=%ymm14 +vpand 96(%rdx),%ymm13,%ymm14 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 128(r=%ymm14 +vpand 128(%rdx),%ymm13,%ymm14 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 160(r=%ymm14 +vpand 160(%rdx),%ymm13,%ymm14 + +# qhasm: r10 ^= 
r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 192(r=%ymm14 +vpand 192(%rdx),%ymm13,%ymm14 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 224(r=%ymm14 +vpand 224(%rdx),%ymm13,%ymm14 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 256(r=%ymm14 +vpand 256(%rdx),%ymm13,%ymm14 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 288(r=%ymm14 +vpand 288(%rdx),%ymm13,%ymm14 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 320(r=%ymm14 +vpand 320(%rdx),%ymm13,%ymm14 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#14 +# asm 2: vpand 352(r=%ymm13 +vpand 352(%rdx),%ymm13,%ymm13 + +# qhasm: r16 ^= r +# asm 1: vpxor r4=reg256#8 +# asm 2: vmovapd r4=%ymm7 +vmovapd %ymm7,%ymm7 + +# qhasm: a4 = mem256[ input_1 + 128 ] +# asm 1: vmovupd 128(a4=reg256#14 +# asm 2: vmovupd 128(a4=%ymm13 +vmovupd 128(%rsi),%ymm13 + +# qhasm: r = a4 & b0 +# asm 1: vpand r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm0,%ymm14 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 32(r=%ymm14 +vpand 32(%rdx),%ymm13,%ymm14 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 64(r=%ymm14 +vpand 64(%rdx),%ymm13,%ymm14 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 96(r=%ymm14 +vpand 96(%rdx),%ymm13,%ymm14 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 128(r=%ymm14 +vpand 128(%rdx),%ymm13,%ymm14 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 160(r=%ymm14 +vpand 160(%rdx),%ymm13,%ymm14 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 192(r=%ymm14 +vpand 192(%rdx),%ymm13,%ymm14 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 224(r=%ymm14 +vpand 224(%rdx),%ymm13,%ymm14 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 256(r=%ymm14 +vpand 256(%rdx),%ymm13,%ymm14 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 288(r=%ymm14 +vpand 288(%rdx),%ymm13,%ymm14 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 320(r=%ymm14 +vpand 320(%rdx),%ymm13,%ymm14 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#14 +# asm 2: vpand 352(r=%ymm13 +vpand 352(%rdx),%ymm13,%ymm13 + +# qhasm: r15 ^= r +# asm 1: vpxor r3=reg256#7 +# asm 2: vmovapd r3=%ymm6 +vmovapd %ymm6,%ymm6 + +# qhasm: a3 = mem256[ input_1 + 96 ] +# asm 1: vmovupd 96(a3=reg256#14 +# asm 2: vmovupd 96(a3=%ymm13 +vmovupd 96(%rsi),%ymm13 + +# qhasm: r = a3 & b0 +# asm 1: vpand r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm0,%ymm14 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 32(r=%ymm14 +vpand 32(%rdx),%ymm13,%ymm14 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 64(r=%ymm14 +vpand 64(%rdx),%ymm13,%ymm14 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 96(r=%ymm14 +vpand 96(%rdx),%ymm13,%ymm14 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 128(r=%ymm14 +vpand 128(%rdx),%ymm13,%ymm14 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 160(r=%ymm14 +vpand 160(%rdx),%ymm13,%ymm14 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 192(r=%ymm14 +vpand 192(%rdx),%ymm13,%ymm14 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 224(r=%ymm14 +vpand 224(%rdx),%ymm13,%ymm14 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 256(r=%ymm14 +vpand 256(%rdx),%ymm13,%ymm14 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 288(r=%ymm14 +vpand 288(%rdx),%ymm13,%ymm14 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 320(r=%ymm14 +vpand 
320(%rdx),%ymm13,%ymm14 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#14 +# asm 2: vpand 352(r=%ymm13 +vpand 352(%rdx),%ymm13,%ymm13 + +# qhasm: r14 ^= r +# asm 1: vpxor r2=reg256#6 +# asm 2: vmovapd r2=%ymm5 +vmovapd %ymm5,%ymm5 + +# qhasm: a2 = mem256[ input_1 + 64 ] +# asm 1: vmovupd 64(a2=reg256#14 +# asm 2: vmovupd 64(a2=%ymm13 +vmovupd 64(%rsi),%ymm13 + +# qhasm: r = a2 & b0 +# asm 1: vpand r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm0,%ymm14 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 32(r=%ymm14 +vpand 32(%rdx),%ymm13,%ymm14 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 64(r=%ymm14 +vpand 64(%rdx),%ymm13,%ymm14 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 96(r=%ymm14 +vpand 96(%rdx),%ymm13,%ymm14 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 128(r=%ymm14 +vpand 128(%rdx),%ymm13,%ymm14 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 160(r=%ymm14 +vpand 160(%rdx),%ymm13,%ymm14 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 192(r=%ymm14 +vpand 192(%rdx),%ymm13,%ymm14 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 224(r=%ymm14 +vpand 224(%rdx),%ymm13,%ymm14 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 256(r=%ymm14 +vpand 256(%rdx),%ymm13,%ymm14 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 288(r=%ymm14 +vpand 288(%rdx),%ymm13,%ymm14 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 320(r=%ymm14 +vpand 320(%rdx),%ymm13,%ymm14 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#14 +# asm 2: vpand 352(r=%ymm13 +vpand 352(%rdx),%ymm13,%ymm13 + +# qhasm: r13 ^= r +# asm 1: vpxor r1=reg256#5 +# asm 2: vmovapd r1=%ymm4 +vmovapd %ymm4,%ymm4 + +# qhasm: a1 = mem256[ input_1 + 32 ] +# asm 1: vmovupd 32(a1=reg256#14 +# asm 2: vmovupd 32(a1=%ymm13 +vmovupd 32(%rsi),%ymm13 + +# qhasm: r = a1 & b0 +# asm 1: vpand r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm0,%ymm14 + +# qhasm: r1 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 32(r=%ymm14 +vpand 32(%rdx),%ymm13,%ymm14 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 64(r=%ymm14 +vpand 64(%rdx),%ymm13,%ymm14 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 96(r=%ymm14 +vpand 96(%rdx),%ymm13,%ymm14 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 128(r=%ymm14 +vpand 128(%rdx),%ymm13,%ymm14 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 160(r=%ymm14 +vpand 160(%rdx),%ymm13,%ymm14 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 192(r=%ymm14 +vpand 192(%rdx),%ymm13,%ymm14 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 224(r=%ymm14 +vpand 224(%rdx),%ymm13,%ymm14 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 256(r=%ymm14 +vpand 256(%rdx),%ymm13,%ymm14 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 288(r=%ymm14 +vpand 288(%rdx),%ymm13,%ymm14 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 320(r=%ymm14 +vpand 320(%rdx),%ymm13,%ymm14 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#14 +# asm 2: vpand 352(r=%ymm13 +vpand 352(%rdx),%ymm13,%ymm13 + +# qhasm: r12 ^= r +# asm 1: vpxor r0=reg256#4 +# asm 2: vmovapd r0=%ymm3 +vmovapd %ymm3,%ymm3 + +# qhasm: a0 = mem256[ input_1 + 0 ] +# asm 1: vmovupd 0(a0=reg256#14 +# asm 2: vmovupd 0(a0=%ymm13 +vmovupd 0(%rsi),%ymm13 + +# qhasm: r = a0 & b0 +# asm 1: vpand r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm0,%ymm0 + +# qhasm: r0 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 
32(r=%ymm0 +vpand 32(%rdx),%ymm13,%ymm0 + +# qhasm: r1 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 64(r=%ymm0 +vpand 64(%rdx),%ymm13,%ymm0 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 96(r=%ymm0 +vpand 96(%rdx),%ymm13,%ymm0 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 128(r=%ymm0 +vpand 128(%rdx),%ymm13,%ymm0 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 160(r=%ymm0 +vpand 160(%rdx),%ymm13,%ymm0 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 192(r=%ymm0 +vpand 192(%rdx),%ymm13,%ymm0 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 224(r=%ymm0 +vpand 224(%rdx),%ymm13,%ymm0 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 256(r=%ymm0 +vpand 256(%rdx),%ymm13,%ymm0 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 288(r=%ymm0 +vpand 288(%rdx),%ymm13,%ymm0 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 320(r=%ymm0 +vpand 320(%rdx),%ymm13,%ymm0 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 352(r=%ymm0 +vpand 352(%rdx),%ymm13,%ymm0 + +# qhasm: r11 ^= r +# asm 1: vpxor r11_stack=stack64#1 +# asm 2: movq r11_stack=608(%rsp) +movq %r11,608(%rsp) + +# qhasm: r12_stack = caller_r12 +# asm 1: movq r12_stack=stack64#2 +# asm 2: movq r12_stack=616(%rsp) +movq %r12,616(%rsp) + +# qhasm: r13_stack = caller_r13 +# asm 1: movq r13_stack=stack64#3 +# asm 2: movq r13_stack=624(%rsp) +movq %r13,624(%rsp) + +# qhasm: r14_stack = caller_r14 +# asm 1: movq r14_stack=stack64#4 +# asm 2: movq r14_stack=632(%rsp) +movq %r14,632(%rsp) + +# qhasm: r15_stack = caller_r15 +# asm 1: movq r15_stack=stack64#5 +# asm 2: movq r15_stack=640(%rsp) +movq %r15,640(%rsp) + +# qhasm: rbx_stack = caller_rbx +# asm 1: movq rbx_stack=stack64#6 +# asm 2: movq rbx_stack=648(%rsp) +movq %rbx,648(%rsp) + +# qhasm: ptr = &buf +# asm 1: leaq ptr=int64#4 +# asm 2: leaq ptr=%rcx +leaq 0(%rsp),%rcx + +# qhasm: s0 = mem256[ input_1 + 0 ] +# asm 1: vmovupd 0(s0=reg256#1 +# asm 2: vmovupd 0(s0=%ymm0 +vmovupd 0(%rsi),%ymm0 + +# qhasm: s1 = mem256[ input_1 + 32 ] +# asm 1: vmovupd 32(s1=reg256#2 +# asm 2: vmovupd 32(s1=%ymm1 +vmovupd 32(%rsi),%ymm1 + +# qhasm: s2 = mem256[ input_1 + 64 ] +# asm 1: vmovupd 64(s2=reg256#3 +# asm 2: vmovupd 64(s2=%ymm2 +vmovupd 64(%rsi),%ymm2 + +# qhasm: t0 = mem256[ input_2 + 0 ] +# asm 1: vmovupd 0(t0=reg256#4 +# asm 2: vmovupd 0(t0=%ymm3 +vmovupd 0(%rdx),%ymm3 + +# qhasm: t1 = mem256[ input_2 + 32 ] +# asm 1: vmovupd 32(t1=reg256#5 +# asm 2: vmovupd 32(t1=%ymm4 +vmovupd 32(%rdx),%ymm4 + +# qhasm: t2 = mem256[ input_2 + 64 ] +# asm 1: vmovupd 64(t2=reg256#6 +# asm 2: vmovupd 64(t2=%ymm5 +vmovupd 64(%rdx),%ymm5 + +# qhasm: a5[0,1,2,3] = s2[2,2,3,3] +# asm 1: vpermq $0xfa,a5=reg256#7 +# asm 2: vpermq $0xfa,a5=%ymm6 +vpermq $0xfa,%ymm2,%ymm6 + +# qhasm: b5[0,1,2,3] = t2[2,3,2,3] +# asm 1: vpermq $0xee,b5=reg256#8 +# asm 2: vpermq $0xee,b5=%ymm7 +vpermq $0xee,%ymm5,%ymm7 + +# qhasm: r10 = a5 & b5 +# asm 1: vpand r10=reg256#9 +# asm 2: vpand r10=%ymm8 +vpand %ymm6,%ymm7,%ymm8 + +# qhasm: mem256[ ptr + 320 ] = r10 +# asm 1: vmovupd b4=reg256#6 +# asm 2: vpermq $0x44,b4=%ymm5 +vpermq $0x44,%ymm5,%ymm5 + +# qhasm: r9 = a5 & b4 +# asm 1: vpand r9=reg256#9 +# asm 2: vpand r9=%ymm8 +vpand %ymm6,%ymm5,%ymm8 + +# qhasm: b3[0,1,2,3] = t1[2,3,2,3] +# asm 1: vpermq $0xee,b3=reg256#10 +# asm 2: vpermq $0xee,b3=%ymm9 +vpermq $0xee,%ymm4,%ymm9 + +# qhasm: r8 = a5 & b3 +# asm 1: vpand r8=reg256#11 +# asm 2: vpand r8=%ymm10 +vpand %ymm6,%ymm9,%ymm10 + +# qhasm: b2[0,1,2,3] = t1[0,1,0,1] +# 
asm 1: vpermq $0x44,b2=reg256#5 +# asm 2: vpermq $0x44,b2=%ymm4 +vpermq $0x44,%ymm4,%ymm4 + +# qhasm: r7 = a5 & b2 +# asm 1: vpand r7=reg256#12 +# asm 2: vpand r7=%ymm11 +vpand %ymm6,%ymm4,%ymm11 + +# qhasm: b1[0,1,2,3] = t0[2,3,2,3] +# asm 1: vpermq $0xee,b1=reg256#13 +# asm 2: vpermq $0xee,b1=%ymm12 +vpermq $0xee,%ymm3,%ymm12 + +# qhasm: r6 = a5 & b1 +# asm 1: vpand r6=reg256#14 +# asm 2: vpand r6=%ymm13 +vpand %ymm6,%ymm12,%ymm13 + +# qhasm: b0[0,1,2,3] = t0[0,1,0,1] +# asm 1: vpermq $0x44,b0=reg256#4 +# asm 2: vpermq $0x44,b0=%ymm3 +vpermq $0x44,%ymm3,%ymm3 + +# qhasm: r5 = a5 & b0 +# asm 1: vpand r5=reg256#7 +# asm 2: vpand r5=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: a4[0,1,2,3] = s2[0,0,1,1] +# asm 1: vpermq $0x50,a4=reg256#3 +# asm 2: vpermq $0x50,a4=%ymm2 +vpermq $0x50,%ymm2,%ymm2 + +# qhasm: r = a4 & b5 +# asm 1: vpand r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm2,%ymm7,%ymm14 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#9 +# asm 2: vpand r=%ymm8 +vpand %ymm2,%ymm5,%ymm8 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#9 +# asm 2: vpand r=%ymm8 +vpand %ymm2,%ymm9,%ymm8 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#9 +# asm 2: vpand r=%ymm8 +vpand %ymm2,%ymm4,%ymm8 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#9 +# asm 2: vpand r=%ymm8 +vpand %ymm2,%ymm12,%ymm8 + +# qhasm: r5 ^= r +# asm 1: vpxor r4=reg256#3 +# asm 2: vpand r4=%ymm2 +vpand %ymm2,%ymm3,%ymm2 + +# qhasm: a3[0,1,2,3] = s1[2,2,3,3] +# asm 1: vpermq $0xfa,a3=reg256#9 +# asm 2: vpermq $0xfa,a3=%ymm8 +vpermq $0xfa,%ymm1,%ymm8 + +# qhasm: r = a3 & b5 +# asm 1: vpand r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm8,%ymm7,%ymm14 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#11 +# asm 2: vpand r=%ymm10 +vpand %ymm8,%ymm5,%ymm10 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#11 +# asm 2: vpand r=%ymm10 +vpand %ymm8,%ymm9,%ymm10 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#11 +# asm 2: vpand r=%ymm10 +vpand %ymm8,%ymm4,%ymm10 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#11 +# asm 2: vpand r=%ymm10 +vpand %ymm8,%ymm12,%ymm10 + +# qhasm: r4 ^= r +# asm 1: vpxor r3=reg256#9 +# asm 2: vpand r3=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: a2[0,1,2,3] = s1[0,0,1,1] +# asm 1: vpermq $0x50,a2=reg256#2 +# asm 2: vpermq $0x50,a2=%ymm1 +vpermq $0x50,%ymm1,%ymm1 + +# qhasm: r = a2 & b5 +# asm 1: vpand r=reg256#11 +# asm 2: vpand r=%ymm10 +vpand %ymm1,%ymm7,%ymm10 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#11 +# asm 2: vpand r=%ymm10 +vpand %ymm1,%ymm5,%ymm10 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#11 +# asm 2: vpand r=%ymm10 +vpand %ymm1,%ymm9,%ymm10 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#11 +# asm 2: vpand r=%ymm10 +vpand %ymm1,%ymm4,%ymm10 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#11 +# asm 2: vpand r=%ymm10 +vpand %ymm1,%ymm12,%ymm10 + +# qhasm: r3 ^= r +# asm 1: vpxor r2=reg256#2 +# asm 2: vpand r2=%ymm1 +vpand %ymm1,%ymm3,%ymm1 + +# qhasm: a1[0,1,2,3] = s0[2,2,3,3] +# asm 1: vpermq $0xfa,a1=reg256#11 +# asm 2: vpermq $0xfa,a1=%ymm10 +vpermq $0xfa,%ymm0,%ymm10 + +# qhasm: r = a1 & b5 +# asm 1: vpand r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm10,%ymm7,%ymm11 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm10,%ymm5,%ymm11 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm10,%ymm9,%ymm11 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm10,%ymm4,%ymm11 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm10,%ymm12,%ymm11 + +# qhasm: r2 ^= r +# asm 1: vpxor r1=reg256#11 +# asm 2: vpand 
r1=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: a0[0,1,2,3] = s0[0,0,1,1] +# asm 1: vpermq $0x50,a0=reg256#1 +# asm 2: vpermq $0x50,a0=%ymm0 +vpermq $0x50,%ymm0,%ymm0 + +# qhasm: r = a0 & b5 +# asm 1: vpand r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm0,%ymm7,%ymm7 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm0,%ymm5,%ymm5 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm0,%ymm9,%ymm5 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#5 +# asm 2: vpand r=%ymm4 +vpand %ymm0,%ymm4,%ymm4 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#5 +# asm 2: vpand r=%ymm4 +vpand %ymm0,%ymm12,%ymm4 + +# qhasm: r1 ^= r +# asm 1: vpxor r0=reg256#1 +# asm 2: vpand r0=%ymm0 +vpand %ymm0,%ymm3,%ymm0 + +# qhasm: mem256[ ptr + 128 ] = r4 +# asm 1: vmovupd h22=int64#2 +# asm 2: movq 344(h22=%rsi +movq 344(%rcx),%rsi + +# qhasm: h13 = h22 +# asm 1: mov h13=int64#3 +# asm 2: mov h13=%rdx +mov %rsi,%rdx + +# qhasm: h10 = h22 +# asm 1: mov h10=int64#2 +# asm 2: mov h10=%rsi +mov %rsi,%rsi + +# qhasm: h21 = mem64[ ptr + 336 ] +# asm 1: movq 336(h21=int64#5 +# asm 2: movq 336(h21=%r8 +movq 336(%rcx),%r8 + +# qhasm: h21 ^= *(uint64 *) ( ptr + 328 ) +# asm 1: xorq 328(h12=int64#6 +# asm 2: mov h12=%r9 +mov %r8,%r9 + +# qhasm: h9 = h21 +# asm 1: mov h9=int64#5 +# asm 2: mov h9=%r8 +mov %r8,%r8 + +# qhasm: h20 = mem64[ ptr + 312 ] +# asm 1: movq 312(h20=int64#7 +# asm 2: movq 312(h20=%rax +movq 312(%rcx),%rax + +# qhasm: h20 ^= *(uint64 *) ( ptr + 320 ) +# asm 1: xorq 320(h11=int64#8 +# asm 2: mov h11=%r10 +mov %rax,%r10 + +# qhasm: h8 = h20 +# asm 1: mov h8=int64#7 +# asm 2: mov h8=%rax +mov %rax,%rax + +# qhasm: h19 = mem64[ ptr + 304 ] +# asm 1: movq 304(h19=int64#9 +# asm 2: movq 304(h19=%r11 +movq 304(%rcx),%r11 + +# qhasm: h19 ^= *(uint64 *) ( ptr + 296 ) +# asm 1: xorq 296(h7=int64#9 +# asm 2: mov h7=%r11 +mov %r11,%r11 + +# qhasm: h18 = mem64[ ptr + 280 ] +# asm 1: movq 280(h18=int64#10 +# asm 2: movq 280(h18=%r12 +movq 280(%rcx),%r12 + +# qhasm: h18 ^= *(uint64 *) ( ptr + 288 ) +# asm 1: xorq 288(h6=int64#10 +# asm 2: mov h6=%r12 +mov %r12,%r12 + +# qhasm: h17 = mem64[ ptr + 272 ] +# asm 1: movq 272(h17=int64#11 +# asm 2: movq 272(h17=%r13 +movq 272(%rcx),%r13 + +# qhasm: h17 ^= *(uint64 *) ( ptr + 264 ) +# asm 1: xorq 264(h5=int64#11 +# asm 2: mov h5=%r13 +mov %r13,%r13 + +# qhasm: h16 = mem64[ ptr + 248 ] +# asm 1: movq 248(h16=int64#12 +# asm 2: movq 248(h16=%r14 +movq 248(%rcx),%r14 + +# qhasm: h16 ^= *(uint64 *) ( ptr + 256 ) +# asm 1: xorq 256(h4=int64#12 +# asm 2: mov h4=%r14 +mov %r14,%r14 + +# qhasm: h15 = mem64[ ptr + 240 ] +# asm 1: movq 240(h15=int64#13 +# asm 2: movq 240(h15=%r15 +movq 240(%rcx),%r15 + +# qhasm: h15 ^= *(uint64 *) ( ptr + 232 ) +# asm 1: xorq 232(h3=int64#13 +# asm 2: mov h3=%r15 +mov %r15,%r15 + +# qhasm: h14 = mem64[ ptr + 216 ] +# asm 1: movq 216(h14=int64#14 +# asm 2: movq 216(h14=%rbx +movq 216(%rcx),%rbx + +# qhasm: h14 ^= *(uint64 *) ( ptr + 224 ) +# asm 1: xorq 224(h2=int64#14 +# asm 2: mov h2=%rbx +mov %rbx,%rbx + +# qhasm: h13 ^= *(uint64 *) ( ptr + 208 ) +# asm 1: xorq 208(h1=int64#3 +# asm 2: mov h1=%rdx +mov %rdx,%rdx + +# qhasm: h12 ^= *(uint64 *) ( ptr + 184 ) +# asm 1: xorq 184(h0=int64#6 +# asm 2: mov h0=%r9 +mov %r9,%r9 + +# qhasm: h11 ^= *(uint64 *) ( ptr + 176 ) +# asm 1: xorq 176(caller_r11=int64#9 +# asm 2: movq caller_r11=%r11 +movq 608(%rsp),%r11 + +# qhasm: caller_r12 = r12_stack +# asm 1: movq caller_r12=int64#10 +# asm 2: movq caller_r12=%r12 +movq 616(%rsp),%r12 + +# qhasm: caller_r13 = 
r13_stack +# asm 1: movq caller_r13=int64#11 +# asm 2: movq caller_r13=%r13 +movq 624(%rsp),%r13 + +# qhasm: caller_r14 = r14_stack +# asm 1: movq caller_r14=int64#12 +# asm 2: movq caller_r14=%r14 +movq 632(%rsp),%r14 + +# qhasm: caller_r15 = r15_stack +# asm 1: movq caller_r15=int64#13 +# asm 2: movq caller_r15=%r15 +movq 640(%rsp),%r15 + +# qhasm: caller_rbx = rbx_stack +# asm 1: movq caller_rbx=int64#14 +# asm 2: movq caller_rbx=%rbx +movq 648(%rsp),%rbx + +# qhasm: return +add %r11,%rsp +ret diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/vec_mul_sp_asm.S b/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/vec_mul_sp_asm.S new file mode 100644 index 0000000000..fe493cdf09 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/vec_mul_sp_asm.S @@ -0,0 +1,1115 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: reg256 s0 + +# qhasm: reg256 s1 + +# qhasm: reg256 s2 + +# qhasm: reg256 s3 + +# qhasm: reg256 s4 + +# qhasm: reg256 s5 + +# qhasm: reg256 b0 + +# qhasm: reg256 b1 + +# qhasm: reg256 b2 + +# qhasm: reg256 b3 + +# qhasm: reg256 b4 + +# qhasm: reg256 b5 + +# qhasm: reg256 a0 + +# qhasm: reg256 a1 + +# qhasm: reg256 a2 + +# qhasm: reg256 a3 + +# qhasm: reg256 a4 + +# qhasm: reg256 a5 + +# qhasm: reg256 r0 + +# qhasm: reg256 r1 + +# qhasm: reg256 r2 + +# qhasm: reg256 r3 + +# qhasm: reg256 r4 + +# qhasm: reg256 r5 + +# qhasm: reg256 r6 + +# qhasm: reg256 r7 + +# qhasm: reg256 r8 + +# qhasm: reg256 r9 + +# qhasm: reg256 r10 + +# qhasm: reg256 r11 + +# qhasm: reg256 r12 + +# qhasm: reg256 r13 + +# qhasm: reg256 r14 + +# qhasm: reg256 r15 + +# qhasm: reg256 r16 + +# qhasm: reg256 r17 + +# qhasm: reg256 r18 + +# qhasm: reg256 r19 + +# qhasm: reg256 r20 + +# qhasm: reg256 r21 + +# qhasm: reg256 r22 + +# qhasm: reg256 r + +# qhasm: int64 h0 + +# qhasm: int64 h1 + +# qhasm: int64 h2 + +# qhasm: int64 h3 + +# qhasm: int64 h4 + +# qhasm: int64 h5 + +# qhasm: int64 h6 + +# qhasm: int64 h7 + +# qhasm: int64 h8 + +# qhasm: int64 h9 + +# qhasm: int64 h10 + +# qhasm: int64 h11 + +# qhasm: int64 h12 + +# qhasm: int64 h13 + +# qhasm: int64 h14 + +# qhasm: int64 h15 + +# qhasm: int64 h16 + +# qhasm: int64 h17 + +# qhasm: int64 h18 + +# qhasm: int64 h19 + +# qhasm: int64 h20 + +# qhasm: int64 h21 + +# qhasm: int64 h22 + +# qhasm: stack4864 buf + +# qhasm: int64 ptr + +# qhasm: int64 tmp + +# qhasm: stack64 r11_stack + +# qhasm: stack64 r12_stack + +# qhasm: stack64 r13_stack + +# qhasm: stack64 r14_stack + +# qhasm: stack64 r15_stack + +# qhasm: stack64 rbx_stack + +# qhasm: stack64 rbp_stack + +# qhasm: enter vec_mul_sp_asm +.p2align 5 +.global _PQCLEAN_MCELIECE348864F_AVX_vec_mul_sp_asm +.global PQCLEAN_MCELIECE348864F_AVX_vec_mul_sp_asm +_PQCLEAN_MCELIECE348864F_AVX_vec_mul_sp_asm: +PQCLEAN_MCELIECE348864F_AVX_vec_mul_sp_asm: +mov %rsp,%r11 +and $31,%r11 +add $672,%r11 +sub %r11,%rsp + +# qhasm: r11_stack = caller_r11 +# asm 1: movq r11_stack=stack64#1 +# asm 2: movq r11_stack=608(%rsp) +movq %r11,608(%rsp) + +# qhasm: r12_stack = caller_r12 +# asm 1: movq r12_stack=stack64#2 +# asm 2: movq r12_stack=616(%rsp) +movq %r12,616(%rsp) + +# qhasm: r13_stack = caller_r13 +# asm 1: movq 
r13_stack=stack64#3 +# asm 2: movq r13_stack=624(%rsp) +movq %r13,624(%rsp) + +# qhasm: r14_stack = caller_r14 +# asm 1: movq r14_stack=stack64#4 +# asm 2: movq r14_stack=632(%rsp) +movq %r14,632(%rsp) + +# qhasm: r15_stack = caller_r15 +# asm 1: movq r15_stack=stack64#5 +# asm 2: movq r15_stack=640(%rsp) +movq %r15,640(%rsp) + +# qhasm: rbx_stack = caller_rbx +# asm 1: movq rbx_stack=stack64#6 +# asm 2: movq rbx_stack=648(%rsp) +movq %rbx,648(%rsp) + +# qhasm: ptr = &buf +# asm 1: leaq ptr=int64#4 +# asm 2: leaq ptr=%rcx +leaq 0(%rsp),%rcx + +# qhasm: s0 = mem256[ input_1 + 0 ] +# asm 1: vmovupd 0(s0=reg256#1 +# asm 2: vmovupd 0(s0=%ymm0 +vmovupd 0(%rsi),%ymm0 + +# qhasm: s1 = mem256[ input_1 + 32 ] +# asm 1: vmovupd 32(s1=reg256#2 +# asm 2: vmovupd 32(s1=%ymm1 +vmovupd 32(%rsi),%ymm1 + +# qhasm: s2 = mem256[ input_1 + 64 ] +# asm 1: vmovupd 64(s2=reg256#3 +# asm 2: vmovupd 64(s2=%ymm2 +vmovupd 64(%rsi),%ymm2 + +# qhasm: a5[0,1,2,3] = s2[2,2,3,3] +# asm 1: vpermq $0xfa,a5=reg256#4 +# asm 2: vpermq $0xfa,a5=%ymm3 +vpermq $0xfa,%ymm2,%ymm3 + +# qhasm: r = mem256[ input_2 + 160 ] +# asm 1: vmovupd 160(r=reg256#5 +# asm 2: vmovupd 160(r=%ymm4 +vmovupd 160(%rdx),%ymm4 + +# qhasm: b5[0,1,2,3] = r[1,3,1,3] +# asm 1: vpermq $0xdd,b5=reg256#5 +# asm 2: vpermq $0xdd,b5=%ymm4 +vpermq $0xdd,%ymm4,%ymm4 + +# qhasm: r10 = a5 & b5 +# asm 1: vpand r10=reg256#6 +# asm 2: vpand r10=%ymm5 +vpand %ymm3,%ymm4,%ymm5 + +# qhasm: mem256[ ptr + 320 ] = r10 +# asm 1: vmovupd r=reg256#6 +# asm 2: vmovupd 128(r=%ymm5 +vmovupd 128(%rdx),%ymm5 + +# qhasm: b4[0,1,2,3] = r[1,3,1,3] +# asm 1: vpermq $0xdd,b4=reg256#6 +# asm 2: vpermq $0xdd,b4=%ymm5 +vpermq $0xdd,%ymm5,%ymm5 + +# qhasm: r9 = a5 & b4 +# asm 1: vpand r9=reg256#7 +# asm 2: vpand r9=%ymm6 +vpand %ymm3,%ymm5,%ymm6 + +# qhasm: r = mem256[ input_2 + 96 ] +# asm 1: vmovupd 96(r=reg256#8 +# asm 2: vmovupd 96(r=%ymm7 +vmovupd 96(%rdx),%ymm7 + +# qhasm: b3[0,1,2,3] = r[1,3,1,3] +# asm 1: vpermq $0xdd,b3=reg256#8 +# asm 2: vpermq $0xdd,b3=%ymm7 +vpermq $0xdd,%ymm7,%ymm7 + +# qhasm: r8 = a5 & b3 +# asm 1: vpand r8=reg256#9 +# asm 2: vpand r8=%ymm8 +vpand %ymm3,%ymm7,%ymm8 + +# qhasm: r = mem256[ input_2 + 64 ] +# asm 1: vmovupd 64(r=reg256#10 +# asm 2: vmovupd 64(r=%ymm9 +vmovupd 64(%rdx),%ymm9 + +# qhasm: b2[0,1,2,3] = r[1,3,1,3] +# asm 1: vpermq $0xdd,b2=reg256#10 +# asm 2: vpermq $0xdd,b2=%ymm9 +vpermq $0xdd,%ymm9,%ymm9 + +# qhasm: r7 = a5 & b2 +# asm 1: vpand r7=reg256#11 +# asm 2: vpand r7=%ymm10 +vpand %ymm3,%ymm9,%ymm10 + +# qhasm: r = mem256[ input_2 + 32 ] +# asm 1: vmovupd 32(r=reg256#12 +# asm 2: vmovupd 32(r=%ymm11 +vmovupd 32(%rdx),%ymm11 + +# qhasm: b1[0,1,2,3] = r[1,3,1,3] +# asm 1: vpermq $0xdd,b1=reg256#12 +# asm 2: vpermq $0xdd,b1=%ymm11 +vpermq $0xdd,%ymm11,%ymm11 + +# qhasm: r6 = a5 & b1 +# asm 1: vpand r6=reg256#13 +# asm 2: vpand r6=%ymm12 +vpand %ymm3,%ymm11,%ymm12 + +# qhasm: r = mem256[ input_2 + 0 ] +# asm 1: vmovupd 0(r=reg256#14 +# asm 2: vmovupd 0(r=%ymm13 +vmovupd 0(%rdx),%ymm13 + +# qhasm: b0[0,1,2,3] = r[1,3,1,3] +# asm 1: vpermq $0xdd,b0=reg256#14 +# asm 2: vpermq $0xdd,b0=%ymm13 +vpermq $0xdd,%ymm13,%ymm13 + +# qhasm: r5 = a5 & b0 +# asm 1: vpand r5=reg256#4 +# asm 2: vpand r5=%ymm3 +vpand %ymm3,%ymm13,%ymm3 + +# qhasm: a4[0,1,2,3] = s2[0,0,1,1] +# asm 1: vpermq $0x50,a4=reg256#3 +# asm 2: vpermq $0x50,a4=%ymm2 +vpermq $0x50,%ymm2,%ymm2 + +# qhasm: r = a4 & b5 +# asm 1: vpand r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm2,%ymm4,%ymm14 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#7 +# asm 2: vpand r=%ymm6 +vpand %ymm2,%ymm5,%ymm6 + 
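+
+# Note (descriptive comment, not qhasm generator output): this routine appears
+# to follow the same AND/XOR schoolbook pattern as vec256_mul_asm, but the
+# coefficient words are first spread across 256-bit lanes with vpermq
+# (e.g. a5 = s2[2,2,3,3], b5 = r[1,3,1,3]); the 256-bit partial sums r0..r10
+# are staged in the on-stack buffer at ptr before the 64-bit scalar reduction
+# pass that follows.
+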
+# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#7 +# asm 2: vpand r=%ymm6 +vpand %ymm2,%ymm7,%ymm6 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#7 +# asm 2: vpand r=%ymm6 +vpand %ymm2,%ymm9,%ymm6 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#7 +# asm 2: vpand r=%ymm6 +vpand %ymm2,%ymm11,%ymm6 + +# qhasm: r5 ^= r +# asm 1: vpxor r4=reg256#3 +# asm 2: vpand r4=%ymm2 +vpand %ymm2,%ymm13,%ymm2 + +# qhasm: a3[0,1,2,3] = s1[2,2,3,3] +# asm 1: vpermq $0xfa,a3=reg256#7 +# asm 2: vpermq $0xfa,a3=%ymm6 +vpermq $0xfa,%ymm1,%ymm6 + +# qhasm: r = a3 & b5 +# asm 1: vpand r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm6,%ymm4,%ymm14 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#9 +# asm 2: vpand r=%ymm8 +vpand %ymm6,%ymm5,%ymm8 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#9 +# asm 2: vpand r=%ymm8 +vpand %ymm6,%ymm7,%ymm8 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#9 +# asm 2: vpand r=%ymm8 +vpand %ymm6,%ymm9,%ymm8 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#9 +# asm 2: vpand r=%ymm8 +vpand %ymm6,%ymm11,%ymm8 + +# qhasm: r4 ^= r +# asm 1: vpxor r3=reg256#7 +# asm 2: vpand r3=%ymm6 +vpand %ymm6,%ymm13,%ymm6 + +# qhasm: a2[0,1,2,3] = s1[0,0,1,1] +# asm 1: vpermq $0x50,a2=reg256#2 +# asm 2: vpermq $0x50,a2=%ymm1 +vpermq $0x50,%ymm1,%ymm1 + +# qhasm: r = a2 & b5 +# asm 1: vpand r=reg256#9 +# asm 2: vpand r=%ymm8 +vpand %ymm1,%ymm4,%ymm8 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#9 +# asm 2: vpand r=%ymm8 +vpand %ymm1,%ymm5,%ymm8 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#9 +# asm 2: vpand r=%ymm8 +vpand %ymm1,%ymm7,%ymm8 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#9 +# asm 2: vpand r=%ymm8 +vpand %ymm1,%ymm9,%ymm8 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#9 +# asm 2: vpand r=%ymm8 +vpand %ymm1,%ymm11,%ymm8 + +# qhasm: r3 ^= r +# asm 1: vpxor r2=reg256#2 +# asm 2: vpand r2=%ymm1 +vpand %ymm1,%ymm13,%ymm1 + +# qhasm: a1[0,1,2,3] = s0[2,2,3,3] +# asm 1: vpermq $0xfa,a1=reg256#9 +# asm 2: vpermq $0xfa,a1=%ymm8 +vpermq $0xfa,%ymm0,%ymm8 + +# qhasm: r = a1 & b5 +# asm 1: vpand r=reg256#11 +# asm 2: vpand r=%ymm10 +vpand %ymm8,%ymm4,%ymm10 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#11 +# asm 2: vpand r=%ymm10 +vpand %ymm8,%ymm5,%ymm10 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#11 +# asm 2: vpand r=%ymm10 +vpand %ymm8,%ymm7,%ymm10 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#11 +# asm 2: vpand r=%ymm10 +vpand %ymm8,%ymm9,%ymm10 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#11 +# asm 2: vpand r=%ymm10 +vpand %ymm8,%ymm11,%ymm10 + +# qhasm: r2 ^= r +# asm 1: vpxor r1=reg256#9 +# asm 2: vpand r1=%ymm8 +vpand %ymm8,%ymm13,%ymm8 + +# qhasm: a0[0,1,2,3] = s0[0,0,1,1] +# asm 1: vpermq $0x50,a0=reg256#1 +# asm 2: vpermq $0x50,a0=%ymm0 +vpermq $0x50,%ymm0,%ymm0 + +# qhasm: r = a0 & b5 +# asm 1: vpand r=reg256#5 +# asm 2: vpand r=%ymm4 +vpand %ymm0,%ymm4,%ymm4 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm0,%ymm5,%ymm3 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm0,%ymm7,%ymm3 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm0,%ymm9,%ymm3 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm0,%ymm11,%ymm3 + +# qhasm: r1 ^= r +# asm 1: vpxor r0=reg256#1 +# asm 2: vpand r0=%ymm0 +vpand %ymm0,%ymm13,%ymm0 + +# qhasm: mem256[ ptr + 128 ] = r4 +# asm 1: vmovupd h22=int64#2 +# asm 2: movq 344(h22=%rsi +movq 344(%rcx),%rsi + +# qhasm: h13 = h22 +# asm 1: mov h13=int64#3 +# asm 2: mov h13=%rdx +mov %rsi,%rdx + +# qhasm: h10 = h22 +# asm 1: mov h10=int64#2 +# asm 2: mov h10=%rsi +mov %rsi,%rsi + 
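+
+# Note (descriptive comment, not qhasm generator output): the scalar pass
+# below gathers 64-bit words of the staged partial products into h22..h0 and
+# folds the high words down (h22 feeding h13 and h10, h21 feeding h12 and h9,
+# and so on), which appears to match reduction of the degree-22 product
+# modulo x^12 + x^3 + 1 before the reduced result is written to the output.
+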
+# qhasm: h21 = mem64[ ptr + 336 ] +# asm 1: movq 336(h21=int64#5 +# asm 2: movq 336(h21=%r8 +movq 336(%rcx),%r8 + +# qhasm: h21 ^= *(uint64 *) ( ptr + 328 ) +# asm 1: xorq 328(h12=int64#6 +# asm 2: mov h12=%r9 +mov %r8,%r9 + +# qhasm: h9 = h21 +# asm 1: mov h9=int64#5 +# asm 2: mov h9=%r8 +mov %r8,%r8 + +# qhasm: h20 = mem64[ ptr + 312 ] +# asm 1: movq 312(h20=int64#7 +# asm 2: movq 312(h20=%rax +movq 312(%rcx),%rax + +# qhasm: h20 ^= *(uint64 *) ( ptr + 320 ) +# asm 1: xorq 320(h11=int64#8 +# asm 2: mov h11=%r10 +mov %rax,%r10 + +# qhasm: h8 = h20 +# asm 1: mov h8=int64#7 +# asm 2: mov h8=%rax +mov %rax,%rax + +# qhasm: h19 = mem64[ ptr + 304 ] +# asm 1: movq 304(h19=int64#9 +# asm 2: movq 304(h19=%r11 +movq 304(%rcx),%r11 + +# qhasm: h19 ^= *(uint64 *) ( ptr + 296 ) +# asm 1: xorq 296(h7=int64#9 +# asm 2: mov h7=%r11 +mov %r11,%r11 + +# qhasm: h18 = mem64[ ptr + 280 ] +# asm 1: movq 280(h18=int64#10 +# asm 2: movq 280(h18=%r12 +movq 280(%rcx),%r12 + +# qhasm: h18 ^= *(uint64 *) ( ptr + 288 ) +# asm 1: xorq 288(h6=int64#10 +# asm 2: mov h6=%r12 +mov %r12,%r12 + +# qhasm: h17 = mem64[ ptr + 272 ] +# asm 1: movq 272(h17=int64#11 +# asm 2: movq 272(h17=%r13 +movq 272(%rcx),%r13 + +# qhasm: h17 ^= *(uint64 *) ( ptr + 264 ) +# asm 1: xorq 264(h5=int64#11 +# asm 2: mov h5=%r13 +mov %r13,%r13 + +# qhasm: h16 = mem64[ ptr + 248 ] +# asm 1: movq 248(h16=int64#12 +# asm 2: movq 248(h16=%r14 +movq 248(%rcx),%r14 + +# qhasm: h16 ^= *(uint64 *) ( ptr + 256 ) +# asm 1: xorq 256(h4=int64#12 +# asm 2: mov h4=%r14 +mov %r14,%r14 + +# qhasm: h15 = mem64[ ptr + 240 ] +# asm 1: movq 240(h15=int64#13 +# asm 2: movq 240(h15=%r15 +movq 240(%rcx),%r15 + +# qhasm: h15 ^= *(uint64 *) ( ptr + 232 ) +# asm 1: xorq 232(h3=int64#13 +# asm 2: mov h3=%r15 +mov %r15,%r15 + +# qhasm: h14 = mem64[ ptr + 216 ] +# asm 1: movq 216(h14=int64#14 +# asm 2: movq 216(h14=%rbx +movq 216(%rcx),%rbx + +# qhasm: h14 ^= *(uint64 *) ( ptr + 224 ) +# asm 1: xorq 224(h2=int64#14 +# asm 2: mov h2=%rbx +mov %rbx,%rbx + +# qhasm: h13 ^= *(uint64 *) ( ptr + 208 ) +# asm 1: xorq 208(h1=int64#3 +# asm 2: mov h1=%rdx +mov %rdx,%rdx + +# qhasm: h12 ^= *(uint64 *) ( ptr + 184 ) +# asm 1: xorq 184(h0=int64#6 +# asm 2: mov h0=%r9 +mov %r9,%r9 + +# qhasm: h11 ^= *(uint64 *) ( ptr + 176 ) +# asm 1: xorq 176(caller_r11=int64#9 +# asm 2: movq caller_r11=%r11 +movq 608(%rsp),%r11 + +# qhasm: caller_r12 = r12_stack +# asm 1: movq caller_r12=int64#10 +# asm 2: movq caller_r12=%r12 +movq 616(%rsp),%r12 + +# qhasm: caller_r13 = r13_stack +# asm 1: movq caller_r13=int64#11 +# asm 2: movq caller_r13=%r13 +movq 624(%rsp),%r13 + +# qhasm: caller_r14 = r14_stack +# asm 1: movq caller_r14=int64#12 +# asm 2: movq caller_r14=%r14 +movq 632(%rsp),%r14 + +# qhasm: caller_r15 = r15_stack +# asm 1: movq caller_r15=int64#13 +# asm 2: movq caller_r15=%r15 +movq 640(%rsp),%r15 + +# qhasm: caller_rbx = rbx_stack +# asm 1: movq caller_rbx=int64#14 +# asm 2: movq caller_rbx=%rbx +movq 648(%rsp),%rbx + +# qhasm: return +add %r11,%rsp +ret diff --git a/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/vec_reduce_asm.S b/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/vec_reduce_asm.S new file mode 100644 index 0000000000..58a4c72048 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece348864f_avx/vec_reduce_asm.S @@ -0,0 +1,356 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 
caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: int64 t + +# qhasm: int64 c + +# qhasm: int64 r + +# qhasm: enter vec_reduce_asm +.p2align 5 +.global _PQCLEAN_MCELIECE348864F_AVX_vec_reduce_asm +.global PQCLEAN_MCELIECE348864F_AVX_vec_reduce_asm +_PQCLEAN_MCELIECE348864F_AVX_vec_reduce_asm: +PQCLEAN_MCELIECE348864F_AVX_vec_reduce_asm: +mov %rsp,%r11 +and $31,%r11 +add $0,%r11 +sub %r11,%rsp + +# qhasm: r = 0 +# asm 1: mov $0,>r=int64#7 +# asm 2: mov $0,>r=%rax +mov $0,%rax + +# qhasm: t = mem64[ input_0 + 88 ] +# asm 1: movq 88(t=int64#2 +# asm 2: movq 88(t=%rsi +movq 88(%rdi),%rsi + +# qhasm: c = count(t) +# asm 1: popcnt c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t=int64#2 +# asm 2: movq 80(t=%rsi +movq 80(%rdi),%rsi + +# qhasm: c = count(t) +# asm 1: popcnt c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t=int64#2 +# asm 2: movq 72(t=%rsi +movq 72(%rdi),%rsi + +# qhasm: c = count(t) +# asm 1: popcnt c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t=int64#2 +# asm 2: movq 64(t=%rsi +movq 64(%rdi),%rsi + +# qhasm: c = count(t) +# asm 1: popcnt c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t=int64#2 +# asm 2: movq 56(t=%rsi +movq 56(%rdi),%rsi + +# qhasm: c = count(t) +# asm 1: popcnt c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t=int64#2 +# asm 2: movq 48(t=%rsi +movq 48(%rdi),%rsi + +# qhasm: c = count(t) +# asm 1: popcnt c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t=int64#2 +# asm 2: movq 40(t=%rsi +movq 40(%rdi),%rsi + +# qhasm: c = count(t) +# asm 1: popcnt c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t=int64#2 +# asm 2: movq 32(t=%rsi +movq 32(%rdi),%rsi + +# qhasm: c = count(t) +# asm 1: popcnt c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t=int64#2 +# asm 2: movq 24(t=%rsi +movq 24(%rdi),%rsi + +# qhasm: c = count(t) +# asm 1: popcnt c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t=int64#2 +# asm 2: movq 16(t=%rsi +movq 16(%rdi),%rsi + +# qhasm: c = count(t) +# asm 1: popcnt c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t=int64#2 +# asm 2: movq 8(t=%rsi +movq 8(%rdi),%rsi + +# qhasm: c = count(t) +# asm 1: popcnt c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t=int64#1 +# asm 2: movq 0(t=%rdi +movq 0(%rdi),%rdi + +# qhasm: c = count(t) +# asm 1: popcnt c=int64#1 +# asm 2: popcnt c=%rdi +popcnt %rdi, %rdi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1, +#include + +#include "aes.h" + + +void PQCLEAN_MCELIECE460896_AVX_aes256ctr( + uint8_t *out, + size_t outlen, + const uint8_t nonce[AESCTR_NONCEBYTES], + const uint8_t key[AES256_KEYBYTES] +); + +#endif diff --git a/src/kem/classic_mceliece/pqclean_mceliece460896_avx/api.h b/src/kem/classic_mceliece/pqclean_mceliece460896_avx/api.h new file mode 100644 index 0000000000..9b7bff7789 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece460896_avx/api.h @@ -0,0 +1,31 @@ +#ifndef PQCLEAN_MCELIECE460896_AVX_API_H +#define 
PQCLEAN_MCELIECE460896_AVX_API_H + +#include + +#define PQCLEAN_MCELIECE460896_AVX_CRYPTO_ALGNAME "Classic McEliece 460896" +#define PQCLEAN_MCELIECE460896_AVX_CRYPTO_PUBLICKEYBYTES 524160 +#define PQCLEAN_MCELIECE460896_AVX_CRYPTO_SECRETKEYBYTES 13568 +#define PQCLEAN_MCELIECE460896_AVX_CRYPTO_CIPHERTEXTBYTES 188 +#define PQCLEAN_MCELIECE460896_AVX_CRYPTO_BYTES 32 + +int PQCLEAN_MCELIECE460896_AVX_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const uint8_t *pk +); + +int PQCLEAN_MCELIECE460896_AVX_crypto_kem_dec( + uint8_t *key, + const uint8_t *c, + const uint8_t *sk +); + +int PQCLEAN_MCELIECE460896_AVX_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece460896_avx/benes.c b/src/kem/classic_mceliece/pqclean_mceliece460896_avx/benes.c new file mode 100644 index 0000000000..64ef249a49 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece460896_avx/benes.c @@ -0,0 +1,311 @@ +/* + This file is for Benes network related functions +*/ + +#include "benes.h" + +#include "params.h" +#include "transpose.h" +#include "util.h" + +static void layer_x(vec128 *data, vec128 *bits) { + int i; + vec128 v0, v1; + vec128 d; + + for (i = 0; i < 64; i += 2) { + v0 = PQCLEAN_MCELIECE460896_AVX_vec128_unpack_low(data[i + 0], data[i + 1]); + v1 = PQCLEAN_MCELIECE460896_AVX_vec128_unpack_high(data[i + 0], data[i + 1]); + + d = PQCLEAN_MCELIECE460896_AVX_vec128_xor(v0, v1); + d = PQCLEAN_MCELIECE460896_AVX_vec128_and(d, *bits++); + v0 = PQCLEAN_MCELIECE460896_AVX_vec128_xor(v0, d); + v1 = PQCLEAN_MCELIECE460896_AVX_vec128_xor(v1, d); + + data[i + 0] = PQCLEAN_MCELIECE460896_AVX_vec128_unpack_low(v0, v1); + data[i + 1] = PQCLEAN_MCELIECE460896_AVX_vec128_unpack_high(v0, v1); + } +} + +static void layer_0(vec128 *bs, vec128 *cond) { + int x; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 2) { + diff = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ x ], bs[ x + 1 ]); + diff = PQCLEAN_MCELIECE460896_AVX_vec128_and(diff, *cond++); + bs[ x ] = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ x ], diff); + bs[ x + 1 ] = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ x + 1 ], diff); + } +} + +static void layer_1(vec128 *bs, vec128 *cond) { + int x; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 4) { + diff = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ x + 0 ], bs[ x + 2 ]); + diff = PQCLEAN_MCELIECE460896_AVX_vec128_and(diff, cond[0]); + bs[ x + 0 ] = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ x + 0 ], diff); + bs[ x + 2 ] = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ x + 2 ], diff); + + diff = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ x + 1 ], bs[ x + 3 ]); + diff = PQCLEAN_MCELIECE460896_AVX_vec128_and(diff, cond[1]); + bs[ x + 1 ] = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ x + 1 ], diff); + bs[ x + 3 ] = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ x + 3 ], diff); + + cond += 2; + } +} + +static void layer_2(vec128 *bs, vec128 *cond) { + int x; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 8) { + diff = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ x + 0 ], bs[ x + 4 ]); + diff = PQCLEAN_MCELIECE460896_AVX_vec128_and(diff, cond[0]); + bs[ x + 0 ] = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ x + 0 ], diff); + bs[ x + 4 ] = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ x + 4 ], diff); + + diff = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ x + 1 ], bs[ x + 5 ]); + diff = PQCLEAN_MCELIECE460896_AVX_vec128_and(diff, cond[1]); + bs[ x + 1 ] = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ x + 1 ], diff); + bs[ x + 5 ] = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ x + 5 ], 
diff); + + diff = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ x + 2 ], bs[ x + 6 ]); + diff = PQCLEAN_MCELIECE460896_AVX_vec128_and(diff, cond[2]); + bs[ x + 2 ] = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ x + 2 ], diff); + bs[ x + 6 ] = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ x + 6 ], diff); + + diff = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ x + 3 ], bs[ x + 7 ]); + diff = PQCLEAN_MCELIECE460896_AVX_vec128_and(diff, cond[3]); + bs[ x + 3 ] = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ x + 3 ], diff); + bs[ x + 7 ] = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ x + 7 ], diff); + + cond += 4; + } +} + +static void layer_3(vec128 *bs, vec128 *cond) { + int x, s; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 16) { + for (s = x; s < x + 8; s += 4) { + diff = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ s + 0 ], bs[ s + 8 ]); + diff = PQCLEAN_MCELIECE460896_AVX_vec128_and(diff, cond[0]); + bs[ s + 0 ] = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ s + 0 ], diff); + bs[ s + 8 ] = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ s + 8 ], diff); + + diff = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ s + 1 ], bs[ s + 9 ]); + diff = PQCLEAN_MCELIECE460896_AVX_vec128_and(diff, cond[1]); + bs[ s + 1 ] = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ s + 1 ], diff); + bs[ s + 9 ] = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ s + 9 ], diff); + + diff = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ s + 2 ], bs[ s + 10 ]); + diff = PQCLEAN_MCELIECE460896_AVX_vec128_and(diff, cond[2]); + bs[ s + 2 ] = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ s + 2 ], diff); + bs[ s + 10 ] = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ s + 10 ], diff); + + diff = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ s + 3 ], bs[ s + 11 ]); + diff = PQCLEAN_MCELIECE460896_AVX_vec128_and(diff, cond[3]); + bs[ s + 3 ] = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ s + 3 ], diff); + bs[ s + 11 ] = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ s + 11 ], diff); + + cond += 4; + } + } +} + +static void layer_4(vec128 *bs, vec128 *cond) { + int x, s; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 32) { + for (s = x; s < x + 16; s += 4) { + diff = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ s + 0 ], bs[ s + 16 ]); + diff = PQCLEAN_MCELIECE460896_AVX_vec128_and(diff, cond[0]); + bs[ s + 0 ] = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ s + 0 ], diff); + bs[ s + 16 ] = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ s + 16 ], diff); + + diff = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ s + 1 ], bs[ s + 17 ]); + diff = PQCLEAN_MCELIECE460896_AVX_vec128_and(diff, cond[1]); + bs[ s + 1 ] = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ s + 1 ], diff); + bs[ s + 17 ] = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ s + 17 ], diff); + + diff = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ s + 2 ], bs[ s + 18 ]); + diff = PQCLEAN_MCELIECE460896_AVX_vec128_and(diff, cond[2]); + bs[ s + 2 ] = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ s + 2 ], diff); + bs[ s + 18 ] = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ s + 18 ], diff); + + diff = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ s + 3 ], bs[ s + 19 ]); + diff = PQCLEAN_MCELIECE460896_AVX_vec128_and(diff, cond[3]); + bs[ s + 3 ] = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ s + 3 ], diff); + bs[ s + 19 ] = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ s + 19 ], diff); + + cond += 4; + } + } +} + +static void layer_5(vec128 *bs, vec128 *cond) { + int x, s; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 64) { + for (s = x; s < x + 32; s += 4) { + diff = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ s + 0 ], bs[ s + 32 ]); + diff = 
PQCLEAN_MCELIECE460896_AVX_vec128_and(diff, cond[0]); + bs[ s + 0 ] = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ s + 0 ], diff); + bs[ s + 32 ] = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ s + 32 ], diff); + + diff = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ s + 1 ], bs[ s + 33 ]); + diff = PQCLEAN_MCELIECE460896_AVX_vec128_and(diff, cond[1]); + bs[ s + 1 ] = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ s + 1 ], diff); + bs[ s + 33 ] = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ s + 33 ], diff); + + diff = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ s + 2 ], bs[ s + 34 ]); + diff = PQCLEAN_MCELIECE460896_AVX_vec128_and(diff, cond[2]); + bs[ s + 2 ] = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ s + 2 ], diff); + bs[ s + 34 ] = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ s + 34 ], diff); + + diff = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ s + 3 ], bs[ s + 35 ]); + diff = PQCLEAN_MCELIECE460896_AVX_vec128_and(diff, cond[3]); + bs[ s + 3 ] = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ s + 3 ], diff); + bs[ s + 35 ] = PQCLEAN_MCELIECE460896_AVX_vec128_xor(bs[ s + 35 ], diff); + + cond += 4; + } + } +} + +/* input: bits, control bits as array of bytes */ +/* output: bits_int, control bits as array of 128-bit vectors */ +void PQCLEAN_MCELIECE460896_AVX_load_bits(vec128 bits_int[][32], const unsigned char *bits) { + int i, j; + const unsigned char *ptr = bits; + + vec128 buf[64]; + + for (i = 0; i <= 5; i += 2) { + for (j = 0; j < 64; j++) { + buf[j] = PQCLEAN_MCELIECE460896_AVX_vec128_set2x(PQCLEAN_MCELIECE460896_AVX_load8(ptr), PQCLEAN_MCELIECE460896_AVX_load8(ptr + 512)); + ptr += 8; + } + + PQCLEAN_MCELIECE460896_AVX_transpose_64x128_sp( buf ); + + for (j = 0; j < 32; j++) { + bits_int[i + 0][j] = PQCLEAN_MCELIECE460896_AVX_vec128_unpack_low(buf[j], buf[j + 32]); + bits_int[i + 1][j] = PQCLEAN_MCELIECE460896_AVX_vec128_unpack_high(buf[j], buf[j + 32]); + } + + ptr += 512; + } + + for (i = 6; i <= 18; i++) { + for (j = 0; j < 32; j++) { + bits_int[i][j] = PQCLEAN_MCELIECE460896_AVX_load16(ptr); + ptr += 16; + } + } + + for (i = 19; i < 25; i += 2) { + for (j = 0; j < 64; j++) { + buf[j] = PQCLEAN_MCELIECE460896_AVX_vec128_set2x(PQCLEAN_MCELIECE460896_AVX_load8(ptr), PQCLEAN_MCELIECE460896_AVX_load8(ptr + 512)); + ptr += 8; + } + + PQCLEAN_MCELIECE460896_AVX_transpose_64x128_sp( buf ); + + for (j = 0; j < 32; j++) { + bits_int[i + 0][j] = PQCLEAN_MCELIECE460896_AVX_vec128_unpack_low(buf[j], buf[j + 32]); + bits_int[i + 1][j] = PQCLEAN_MCELIECE460896_AVX_vec128_unpack_high(buf[j], buf[j + 32]); + } + + ptr += 512; + } +} + +/* input: r, sequence of bits to be permuted */ +/* b, control bits as array of 128-bit vectors */ +/* rev, 0 for normal application; !0 for inverse */ +/* output: r, permuted bits */ +void PQCLEAN_MCELIECE460896_AVX_benes(vec128 *r, vec128 b[][32], int rev) { + int inc; + + vec128 *b_ptr; + + if (rev == 0) { + inc = 32; + b_ptr = b[ 0]; + } else { + inc = -32; + b_ptr = b[24]; + } + + // + + PQCLEAN_MCELIECE460896_AVX_transpose_64x128_sp( r ); + + layer_0(r, b_ptr); + b_ptr += inc; + layer_1(r, b_ptr); + b_ptr += inc; + layer_2(r, b_ptr); + b_ptr += inc; + layer_3(r, b_ptr); + b_ptr += inc; + layer_4(r, b_ptr); + b_ptr += inc; + layer_5(r, b_ptr); + b_ptr += inc; + + PQCLEAN_MCELIECE460896_AVX_transpose_64x128_sp( r ); + + layer_x(r, b_ptr); + b_ptr += inc; + layer_0(r, b_ptr); + b_ptr += inc; + layer_1(r, b_ptr); + b_ptr += inc; + layer_2(r, b_ptr); + b_ptr += inc; + layer_3(r, b_ptr); + b_ptr += inc; + layer_4(r, b_ptr); + b_ptr += inc; + layer_5(r, b_ptr); + b_ptr += inc; + layer_4(r, 
b_ptr); + b_ptr += inc; + layer_3(r, b_ptr); + b_ptr += inc; + layer_2(r, b_ptr); + b_ptr += inc; + layer_1(r, b_ptr); + b_ptr += inc; + layer_0(r, b_ptr); + b_ptr += inc; + layer_x(r, b_ptr); + b_ptr += inc; + + PQCLEAN_MCELIECE460896_AVX_transpose_64x128_sp( r ); + + layer_5(r, b_ptr); + b_ptr += inc; + layer_4(r, b_ptr); + b_ptr += inc; + layer_3(r, b_ptr); + b_ptr += inc; + layer_2(r, b_ptr); + b_ptr += inc; + layer_1(r, b_ptr); + b_ptr += inc; + layer_0(r, b_ptr); + //b_ptr += inc; + + PQCLEAN_MCELIECE460896_AVX_transpose_64x128_sp( r ); +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece460896_avx/benes.h b/src/kem/classic_mceliece/pqclean_mceliece460896_avx/benes.h new file mode 100644 index 0000000000..5bb2798dd9 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece460896_avx/benes.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE460896_AVX_BENES_H +#define PQCLEAN_MCELIECE460896_AVX_BENES_H +/* + This file is for Benes network related functions +*/ + + +#include "vec128.h" + +void PQCLEAN_MCELIECE460896_AVX_load_bits(vec128 /*bits_int*/[][32], const unsigned char * /*bits*/); +void PQCLEAN_MCELIECE460896_AVX_benes(vec128 * /*r*/, vec128 /*b*/[][32], int /*rev*/); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece460896_avx/bm.c b/src/kem/classic_mceliece/pqclean_mceliece460896_avx/bm.c new file mode 100644 index 0000000000..de2999f046 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece460896_avx/bm.c @@ -0,0 +1,215 @@ +/* + This file is for the inversion-free Berlekamp-Massey algorithm + see https://ieeexplore.ieee.org/document/87857 +*/ + +#include "bm.h" + +#include "gf.h" +#include "params.h" +#include "vec128.h" + +#include + +extern gf PQCLEAN_MCELIECE460896_AVX_vec_reduce_asm(vec128 *); +extern void PQCLEAN_MCELIECE460896_AVX_update_asm(void *, gf, int); + +static inline uint16_t mask_nonzero(gf a) { + uint32_t ret = a; + + ret -= 1; + ret >>= 31; + ret -= 1; + + return ret; +} + +static inline uint16_t mask_leq(uint16_t a, uint16_t b) { + uint32_t a_tmp = a; + uint32_t b_tmp = b; + uint32_t ret = b_tmp - a_tmp; + + ret >>= 31; + ret -= 1; + + return ret; +} + +static inline void vec128_cmov(vec128 out[][2], uint16_t mask) { + int i; + + vec128 v0, v1; + + vec128 m0 = PQCLEAN_MCELIECE460896_AVX_vec128_set1_16b( mask); + vec128 m1 = PQCLEAN_MCELIECE460896_AVX_vec128_set1_16b(~mask); + + for (i = 0; i < GFBITS; i++) { + v0 = PQCLEAN_MCELIECE460896_AVX_vec128_and(out[i][1], m0); + v1 = PQCLEAN_MCELIECE460896_AVX_vec128_and(out[i][0], m1); + out[i][0] = PQCLEAN_MCELIECE460896_AVX_vec128_or(v0, v1); + } +} + +static inline void interleave(vec256 *in, int idx0, int idx1, vec256 *mask, int b) { + int s = 1 << b; + + vec256 x, y; + + x = PQCLEAN_MCELIECE460896_AVX_vec256_or(PQCLEAN_MCELIECE460896_AVX_vec256_and(in[idx0], mask[0]), + PQCLEAN_MCELIECE460896_AVX_vec256_sll_4x(PQCLEAN_MCELIECE460896_AVX_vec256_and(in[idx1], mask[0]), s)); + + y = PQCLEAN_MCELIECE460896_AVX_vec256_or(PQCLEAN_MCELIECE460896_AVX_vec256_srl_4x(PQCLEAN_MCELIECE460896_AVX_vec256_and(in[idx0], mask[1]), s), + PQCLEAN_MCELIECE460896_AVX_vec256_and(in[idx1], mask[1])); + + in[idx0] = x; + in[idx1] = y; +} + +/* input: in, field elements in bitsliced form */ +/* output: out, field elements in non-bitsliced form */ +static inline void get_coefs(gf *out, vec256 *in) { + int i, k; + + vec256 mask[4][2]; + vec256 buf[16]; + + for (i = 0; i < 13; i++) { + buf[i] = in[i]; + } + for (i = 13; i < 16; i++) { + buf[i] = PQCLEAN_MCELIECE460896_AVX_vec256_setzero(); + } + + mask[0][0] = 
PQCLEAN_MCELIECE460896_AVX_vec256_set1_16b(0x5555); + mask[0][1] = PQCLEAN_MCELIECE460896_AVX_vec256_set1_16b(0xAAAA); + mask[1][0] = PQCLEAN_MCELIECE460896_AVX_vec256_set1_16b(0x3333); + mask[1][1] = PQCLEAN_MCELIECE460896_AVX_vec256_set1_16b(0xCCCC); + mask[2][0] = PQCLEAN_MCELIECE460896_AVX_vec256_set1_16b(0x0F0F); + mask[2][1] = PQCLEAN_MCELIECE460896_AVX_vec256_set1_16b(0xF0F0); + mask[3][0] = PQCLEAN_MCELIECE460896_AVX_vec256_set1_16b(0x00FF); + mask[3][1] = PQCLEAN_MCELIECE460896_AVX_vec256_set1_16b(0xFF00); + + interleave(buf, 0, 8, mask[3], 3); + interleave(buf, 1, 9, mask[3], 3); + interleave(buf, 2, 10, mask[3], 3); + interleave(buf, 3, 11, mask[3], 3); + interleave(buf, 4, 12, mask[3], 3); + interleave(buf, 5, 13, mask[3], 3); + interleave(buf, 6, 14, mask[3], 3); + interleave(buf, 7, 15, mask[3], 3); + + interleave(buf, 0, 4, mask[2], 2); + interleave(buf, 1, 5, mask[2], 2); + interleave(buf, 2, 6, mask[2], 2); + interleave(buf, 3, 7, mask[2], 2); + interleave(buf, 8, 12, mask[2], 2); + interleave(buf, 9, 13, mask[2], 2); + interleave(buf, 10, 14, mask[2], 2); + interleave(buf, 11, 15, mask[2], 2); + + interleave(buf, 0, 2, mask[1], 1); + interleave(buf, 1, 3, mask[1], 1); + interleave(buf, 4, 6, mask[1], 1); + interleave(buf, 5, 7, mask[1], 1); + interleave(buf, 8, 10, mask[1], 1); + interleave(buf, 9, 11, mask[1], 1); + interleave(buf, 12, 14, mask[1], 1); + interleave(buf, 13, 15, mask[1], 1); + + interleave(buf, 0, 1, mask[0], 0); + interleave(buf, 2, 3, mask[0], 0); + interleave(buf, 4, 5, mask[0], 0); + interleave(buf, 6, 7, mask[0], 0); + interleave(buf, 8, 9, mask[0], 0); + interleave(buf, 10, 11, mask[0], 0); + interleave(buf, 12, 13, mask[0], 0); + interleave(buf, 14, 15, mask[0], 0); + + for (i = 0; i < 16; i++) { + for (k = 0; k < 4; k++) { + out[ (4 * 0 + k) * 16 + i ] = (PQCLEAN_MCELIECE460896_AVX_vec256_extract(buf[i], 0) >> (k * 16)) & GFMASK; + out[ (4 * 1 + k) * 16 + i ] = (PQCLEAN_MCELIECE460896_AVX_vec256_extract(buf[i], 1) >> (k * 16)) & GFMASK; + out[ (4 * 2 + k) * 16 + i ] = (PQCLEAN_MCELIECE460896_AVX_vec256_extract(buf[i], 2) >> (k * 16)) & GFMASK; + out[ (4 * 3 + k) * 16 + i ] = (PQCLEAN_MCELIECE460896_AVX_vec256_extract(buf[i], 3) >> (k * 16)) & GFMASK; + } + } +} + +typedef union { + vec128 as_128[GFBITS][2]; + vec256 as_256[GFBITS]; +} aligned_double_vec128; + +/* input: in, sequence of field elements */ +/* output: out, minimal polynomial of in */ +void PQCLEAN_MCELIECE460896_AVX_bm(vec128 *out, vec256 *in) { + int i; + uint16_t N, L; + uint16_t mask; + uint64_t one = 1; + uint64_t v[2]; + + vec128 prod[ GFBITS ]; + vec128 interval[GFBITS]; + + aligned_double_vec128 db; + aligned_double_vec128 BC_tmp; + aligned_double_vec128 BC; + + gf d, b; + gf coefs[256]; + + // initialization + + get_coefs(coefs, in); + + BC.as_128[0][0] = PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0, one << 62); + BC.as_128[0][1] = PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0, one << 63); + + for (i = 1; i < GFBITS; i++) { + BC.as_128[i][0] = BC.as_128[i][1] = PQCLEAN_MCELIECE460896_AVX_vec128_setzero(); + } + + b = 1; + L = 0; + + // + + for (i = 0; i < GFBITS; i++) { + interval[i] = PQCLEAN_MCELIECE460896_AVX_vec128_setzero(); + } + + for (N = 0; N < SYS_T * 2; N++) { + PQCLEAN_MCELIECE460896_AVX_update_asm(interval, coefs[N], 16); + PQCLEAN_MCELIECE460896_AVX_vec128_mul_asm(prod, interval, BC.as_128[0] + 1, 32); + + d = PQCLEAN_MCELIECE460896_AVX_vec_reduce_asm(prod); + + mask = mask_nonzero(d) & mask_leq(L * 2, N); + + for (i = 0; i < GFBITS; i++) { + db.as_128[i][0] = 
PQCLEAN_MCELIECE460896_AVX_vec128_setbits((d >> i) & 1); + db.as_128[i][1] = PQCLEAN_MCELIECE460896_AVX_vec128_setbits((b >> i) & 1); + } + + PQCLEAN_MCELIECE460896_AVX_vec256_mul(BC_tmp.as_256, db.as_256, BC.as_256); + + vec128_cmov(BC.as_128, mask); + PQCLEAN_MCELIECE460896_AVX_update_asm(BC.as_128, 0, 32); + + for (i = 0; i < GFBITS; i++) { + BC.as_128[i][1] = PQCLEAN_MCELIECE460896_AVX_vec128_xor(BC_tmp.as_128[i][0], BC_tmp.as_128[i][1]); + } + + b = (d & mask) | (b & ~mask); + L = ((N + 1 - L) & mask) | (L & ~mask); + } + + for (i = 0; i < GFBITS; i++) { + v[0] = PQCLEAN_MCELIECE460896_AVX_vec128_extract(BC.as_128[i][1], 0); + v[1] = PQCLEAN_MCELIECE460896_AVX_vec128_extract(BC.as_128[i][1], 1); + + out[i] = PQCLEAN_MCELIECE460896_AVX_vec128_set2x((v[0] >> 31) | (v[1] << 33), v[1] >> 31); + } +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece460896_avx/bm.h b/src/kem/classic_mceliece/pqclean_mceliece460896_avx/bm.h new file mode 100644 index 0000000000..4026125004 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece460896_avx/bm.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE460896_AVX_BM_H +#define PQCLEAN_MCELIECE460896_AVX_BM_H +/* + This file is for the inversion-free Berlekamp-Massey algorithm + see https://ieeexplore.ieee.org/document/87857 +*/ + +#include "vec128.h" +#include "vec256.h" + +void PQCLEAN_MCELIECE460896_AVX_bm(vec128 * /*out*/, vec256 * /*in*/); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece460896_avx/consts.S b/src/kem/classic_mceliece/pqclean_mceliece460896_avx/consts.S new file mode 100644 index 0000000000..de62f1b014 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece460896_avx/consts.S @@ -0,0 +1,33 @@ +.data + +# not supported on MacOS +#.section .rodata + +.globl PQCLEAN_MCELIECE460896_AVX_MASK0_0 +.globl PQCLEAN_MCELIECE460896_AVX_MASK0_1 +.globl PQCLEAN_MCELIECE460896_AVX_MASK1_0 +.globl PQCLEAN_MCELIECE460896_AVX_MASK1_1 +.globl PQCLEAN_MCELIECE460896_AVX_MASK2_0 +.globl PQCLEAN_MCELIECE460896_AVX_MASK2_1 +.globl PQCLEAN_MCELIECE460896_AVX_MASK3_0 +.globl PQCLEAN_MCELIECE460896_AVX_MASK3_1 +.globl PQCLEAN_MCELIECE460896_AVX_MASK4_0 +.globl PQCLEAN_MCELIECE460896_AVX_MASK4_1 +.globl PQCLEAN_MCELIECE460896_AVX_MASK5_0 +.globl PQCLEAN_MCELIECE460896_AVX_MASK5_1 + +.p2align 5 + +PQCLEAN_MCELIECE460896_AVX_MASK0_0: .quad 0x5555555555555555, 0x5555555555555555, 0x5555555555555555, 0x5555555555555555 +PQCLEAN_MCELIECE460896_AVX_MASK0_1: .quad 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA +PQCLEAN_MCELIECE460896_AVX_MASK1_0: .quad 0x3333333333333333, 0x3333333333333333, 0x3333333333333333, 0x3333333333333333 +PQCLEAN_MCELIECE460896_AVX_MASK1_1: .quad 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC +PQCLEAN_MCELIECE460896_AVX_MASK2_0: .quad 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F +PQCLEAN_MCELIECE460896_AVX_MASK2_1: .quad 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0 +PQCLEAN_MCELIECE460896_AVX_MASK3_0: .quad 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF +PQCLEAN_MCELIECE460896_AVX_MASK3_1: .quad 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00 +PQCLEAN_MCELIECE460896_AVX_MASK4_0: .quad 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF +PQCLEAN_MCELIECE460896_AVX_MASK4_1: .quad 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000 +PQCLEAN_MCELIECE460896_AVX_MASK5_0: 
.quad 0x00000000FFFFFFFF, 0x00000000FFFFFFFF, 0x00000000FFFFFFFF, 0x00000000FFFFFFFF +PQCLEAN_MCELIECE460896_AVX_MASK5_1: .quad 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000 + diff --git a/src/kem/classic_mceliece/pqclean_mceliece460896_avx/consts.inc b/src/kem/classic_mceliece/pqclean_mceliece460896_avx/consts.inc new file mode 100644 index 0000000000..217965d3c6 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece460896_avx/consts.inc @@ -0,0 +1,502 @@ +//64 +{ + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X6969969669699696, 0X6969969669699696, 0X6969969669699696, 0X6969969669699696), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X9966669966999966, 0X9966669966999966, 0X9966669966999966, 0X9966669966999966), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X9966669966999966, 0X9966669966999966, 0X9966669966999966, 0X9966669966999966), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0XFF0000FF00FFFF00, 0XFF0000FF00FFFF00, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XCC3333CCCC3333CC, 0XCC3333CCCC3333CC, 0XCC3333CCCC3333CC, 0XCC3333CCCC3333CC), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X9966669966999966, 0X9966669966999966, 0X9966669966999966, 0X9966669966999966), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X6666666666666666, 0X6666666666666666, 0X6666666666666666, 0X6666666666666666), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XA55AA55AA55AA55A, 0XA55AA55AA55AA55A, 0XA55AA55AA55AA55A, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XCCCC33333333CCCC, 0XCCCC33333333CCCC, 0XCCCC33333333CCCC, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0FF0F00FF00F0FF0, 0X0FF0F00FF00F0FF0, 0X0FF0F00FF00F0FF0, 0X0FF0F00FF00F0FF0), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A) +}, +//128 +{ + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X9999999966666666, 0X6666666699999999, 0X9999999966666666, 0X6666666699999999), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3, 0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XCC33CC3333CC33CC, 0X33CC33CCCC33CC33, 0XCC33CC3333CC33CC, 0X33CC33CCCC33CC33), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X3C3C3C3C3C3C3C3C, 0X3C3C3C3C3C3C3C3C, 0X3C3C3C3C3C3C3C3C, 0X3C3C3C3C3C3C3C3C), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XAA5555AAAA5555AA, 0XAA5555AAAA5555AA, 0XAA5555AAAA5555AA, 0XAA5555AAAA5555AA), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XC33C3CC33CC3C33C, 
0XC33C3CC33CC3C33C, 0XC33C3CC33CC3C33C, 0XC33C3CC33CC3C33C), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X00FFFF0000FFFF00, 0XFF0000FFFF0000FF, 0X00FFFF0000FFFF00, 0XFF0000FFFF0000FF) +}, +//256 +{ + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0FF0F00FF00F0FF0, 0X0FF0F00FF00F0FF0, 0XF00F0FF00FF0F00F, 0XF00F0FF00FF0F00F), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X9669966969966996, 0X6996699696699669, 0X6996699696699669, 0X9669966969966996), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000FFFFFFFF0000, 0XFFFF00000000FFFF, 0X0000FFFFFFFF0000, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X33333333CCCCCCCC, 0X33333333CCCCCCCC, 0X33333333CCCCCCCC, 0X33333333CCCCCCCC), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0X5AA5A55AA55A5AA5, 0X5AA5A55AA55A5AA5, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X00FFFF0000FFFF00, 0XFF0000FFFF0000FF, 0XFF0000FFFF0000FF, 0X00FFFF0000FFFF00), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XC33CC33CC33CC33C, 0XC33CC33CC33CC33C, 0XC33CC33CC33CC33C, 0XC33CC33CC33CC33C), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555) +}, +//512 +{ + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XC33C3CC33CC3C33C, 0X3CC3C33CC33C3CC3, 0XC33C3CC33CC3C33C, 0X3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X9966669966999966, 0X6699996699666699, 0X9966669966999966, 0X6699996699666699), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X9966996699669966, 0X6699669966996699, 0X6699669966996699, 0X9966996699669966), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X6969969669699696, 0X6969969669699696, 0X6969969669699696, 0X6969969669699696), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XAA55AA5555AA55AA, 0XAA55AA5555AA55AA, 0XAA55AA5555AA55AA, 0XAA55AA5555AA55AA), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X9966996699669966, 0X9966996699669966, 0X6699669966996699, 0X6699669966996699), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X5AA5A55A5AA5A55A, 0XA55A5AA5A55A5AA5, 0X5AA5A55A5AA5A55A, 0XA55A5AA5A55A5AA5), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XC3C3C3C33C3C3C3C, 0XC3C3C3C33C3C3C3C, 0X3C3C3C3CC3C3C3C3, 0X3C3C3C3CC3C3C3C3), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X3333CCCC3333CCCC, 0X3333CCCC3333CCCC, 0XCCCC3333CCCC3333, 0XCCCC3333CCCC3333), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X9999999966666666, 0X6666666699999999, 0X6666666699999999, 0X9999999966666666), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XC33CC33CC33CC33C, 0X3CC33CC33CC33CC3, 0XC33CC33CC33CC33C, 0X3CC33CC33CC33CC3), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X6666999999996666, 0X9999666666669999, 0X9999666666669999, 0X6666999999996666) +}, +{ + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XC33C3CC33CC3C33C, 0X3CC3C33CC33C3CC3, 0XC33C3CC33CC3C33C, 0X3CC3C33CC33C3CC3), + 
PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X6699996699666699, 0X9966669966999966, 0X6699996699666699, 0X9966669966999966), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X6699669966996699, 0X9966996699669966, 0X9966996699669966, 0X6699669966996699), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X6969969669699696, 0X6969969669699696, 0X6969969669699696, 0X6969969669699696), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X55AA55AAAA55AA55, 0X55AA55AAAA55AA55, 0X55AA55AAAA55AA55, 0X55AA55AAAA55AA55), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X9966996699669966, 0X9966996699669966, 0X6699669966996699, 0X6699669966996699), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X5AA5A55A5AA5A55A, 0XA55A5AA5A55A5AA5, 0X5AA5A55A5AA5A55A, 0XA55A5AA5A55A5AA5), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XC3C3C3C33C3C3C3C, 0XC3C3C3C33C3C3C3C, 0X3C3C3C3CC3C3C3C3, 0X3C3C3C3CC3C3C3C3), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X3333CCCC3333CCCC, 0X3333CCCC3333CCCC, 0XCCCC3333CCCC3333, 0XCCCC3333CCCC3333), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X9999999966666666, 0X6666666699999999, 0X6666666699999999, 0X9999999966666666), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XC33CC33CC33CC33C, 0X3CC33CC33CC33CC3, 0XC33CC33CC33CC33C, 0X3CC33CC33CC33CC3), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X6666999999996666, 0X9999666666669999, 0X9999666666669999, 0X6666999999996666) +}, +//1024 +{ + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3, 0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA, 0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA, 0XAA5555AA55AAAA55, 0XAA5555AA55AAAA55), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XF00F0FF0F00F0FF0, 0XF00F0FF0F00F0FF0, 0X0FF0F00F0FF0F00F, 0X0FF0F00F0FF0F00F), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X9669699696696996, 0X9669699696696996, 0X9669699696696996, 0X9669699696696996), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XA55AA55AA55AA55A, 0X5AA55AA55AA55AA5, 0X5AA55AA55AA55AA5, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA, 0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XCCCC33333333CCCC, 0X3333CCCCCCCC3333, 0X3333CCCCCCCC3333, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000, 0XFFFF00000000FFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X6996699669966996, 0X9669966996699669, 0X9669966996699669, 0X6996699669966996) +}, +{ + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3, 0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA, 0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 
0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XAA5555AA55AAAA55, 0XAA5555AA55AAAA55, 0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0FF0F00F0FF0F00F, 0X0FF0F00F0FF0F00F, 0XF00F0FF0F00F0FF0, 0XF00F0FF0F00F0FF0), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X6996966969969669, 0X6996966969969669, 0X6996966969969669, 0X6996966969969669), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XA55AA55AA55AA55A, 0X5AA55AA55AA55AA5, 0X5AA55AA55AA55AA5, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555, 0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XCCCC33333333CCCC, 0X3333CCCCCCCC3333, 0X3333CCCCCCCC3333, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000, 0XFFFF00000000FFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X6996699669966996, 0X9669966996699669, 0X9669966996699669, 0X6996699669966996) +}, +{ + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3, 0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555, 0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XAA5555AA55AAAA55, 0XAA5555AA55AAAA55, 0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XF00F0FF0F00F0FF0, 0XF00F0FF0F00F0FF0, 0X0FF0F00F0FF0F00F, 0X0FF0F00F0FF0F00F), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X9669699696696996, 0X9669699696696996, 0X9669699696696996, 0X9669699696696996), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XA55AA55AA55AA55A, 0X5AA55AA55AA55AA5, 0X5AA55AA55AA55AA5, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA, 0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XCCCC33333333CCCC, 0X3333CCCCCCCC3333, 0X3333CCCCCCCC3333, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000, 0XFFFF00000000FFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X6996699669966996, 0X9669966996699669, 0X9669966996699669, 0X6996699669966996) +}, +{ + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3, 0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555, 0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA, 0XAA5555AA55AAAA55, 0XAA5555AA55AAAA55), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0FF0F00F0FF0F00F, 0X0FF0F00F0FF0F00F, 0XF00F0FF0F00F0FF0, 0XF00F0FF0F00F0FF0), + 
PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X6996966969969669, 0X6996966969969669, 0X6996966969969669, 0X6996966969969669), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XA55AA55AA55AA55A, 0X5AA55AA55AA55AA5, 0X5AA55AA55AA55AA5, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555, 0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XCCCC33333333CCCC, 0X3333CCCCCCCC3333, 0X3333CCCCCCCC3333, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000, 0XFFFF00000000FFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X6996699669966996, 0X9669966996699669, 0X9669966996699669, 0X6996699669966996) +}, +//2048 +{ + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA, 0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F, 0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55, 0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0, 0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 
0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55, 0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F, 0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA, 0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0, 0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + 
PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X5555555555555555, 0X5555555555555555, 0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA, 0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F, 0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X5555555555555555, 0X5555555555555555, 0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55, 0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0, 0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 
0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X5555555555555555, 0X5555555555555555, 0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55, 0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F, 0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X5555555555555555, 0X5555555555555555, 0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA, 0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0, 0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +//4096 +{ + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + 
PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 
0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + 
PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 
0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + 
PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 
0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + 
PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 
0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +} diff --git a/src/kem/classic_mceliece/pqclean_mceliece460896_avx/controlbits.c b/src/kem/classic_mceliece/pqclean_mceliece460896_avx/controlbits.c new file mode 100644 index 0000000000..f3cee38ae3 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece460896_avx/controlbits.c @@ -0,0 +1,274 @@ +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + +#include "controlbits.h" + +#include "params.h" + +#include + +typedef uint8_t bit; + +#define N (1 << GFBITS) + +static bit is_smaller(uint32_t a, uint32_t b) { + uint32_t ret = 0; + + ret = a - b; + ret >>= 31; + + return (bit)ret; +} + +static bit is_smaller_63b(uint64_t a, uint64_t b) { + uint64_t ret = 0; + + ret = a - b; + ret >>= 63; + + return (bit)ret; +} + +static void cswap(uint32_t *x, uint32_t *y, bit swap) { + uint32_t m; + uint32_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +static void cswap_63b(uint64_t *x, uint64_t *y, bit swap) { + uint64_t m; + uint64_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +/* output x = min(input x,input y) */ +/* output y = max(input x,input y) */ + +static void minmax(uint32_t *x, uint32_t *y) { + bit m; + + m = is_smaller(*y, *x); + cswap(x, y, m); +} + +static void minmax_63b(uint64_t *x, uint64_t *y) { + bit m; + + m = is_smaller_63b(*y, *x); + cswap_63b(x, y, m); +} + +/* merge first half of x[0],x[step],...,x[(2*n-1)*step] with second half */ +/* requires n to be a power of 2 */ + +static void merge(int n, uint32_t *x, int step) { + int i; + if (n == 1) { + minmax(&x[0], &x[step]); + } else { + merge(n / 2, x, step * 2); + merge(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax(&x[i * step], &x[(i + 1) * step]); + } + } +} + +static void merge_63b(int n, uint64_t *x, int step) { + int i; + if (n == 1) { + minmax_63b(&x[0], &x[step]); + } else { + merge_63b(n / 2, x, step * 2); + merge_63b(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax_63b(&x[i * step], &x[(i + 1) * step]); + } + } +} + +/* sort x[0],x[1],...,x[n-1] in place */ +/* requires n to be a power of 2 */ + +static void sort(int n, uint32_t *x) { + if (n <= 1) { + return; + } + sort(n / 2, x); + sort(n / 2, x + n / 2); + merge(n / 2, x, 1); +} + +void PQCLEAN_MCELIECE460896_AVX_sort_63b(int n, uint64_t *x) { + if (n <= 1) { + return; + } + PQCLEAN_MCELIECE460896_AVX_sort_63b(n / 2, x); + PQCLEAN_MCELIECE460896_AVX_sort_63b(n / 2, x + n / 2); + merge_63b(n / 2, x, 1); +} + +/* y[pi[i]] = x[i] */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void composeinv(int n, uint32_t *y, const uint32_t *x, const uint32_t *pi) { // NC + int i; + uint32_t t[2 * N]; + + 
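    /*
       Note (added for exposition; not part of the upstream PQClean file):
       composeinv realizes y[pi[i]] = x[i] using a sort instead of direct
       indexing, the standard constant-time way to apply a secret permutation.
       Each 32-bit entry t[i] packs the payload x[i] in its low 16 bits and
       the key pi[i] in its high 16 bits; because pi is a permutation the keys
       are distinct, so the sort below orders the entries by pi[i], and
       masking off the high halves afterwards leaves the payloads in the
       permuted order without any secret-dependent memory index.
    */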
for (i = 0; i < n; ++i) { + t[i] = x[i] | (pi[i] << 16); + } + + sort(n, t); + + for (i = 0; i < n; ++i) { + y[i] = t[i] & 0xFFFF; + } +} + +/* ip[i] = j iff pi[i] = j */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void invert(int n, uint32_t *ip, const uint32_t *pi) { + int i; + + for (i = 0; i < n; i++) { + ip[i] = i; + } + + composeinv(n, ip, ip, pi); +} + + +static void flow(int w, uint32_t *x, const uint32_t *y, int t) { + bit m0; + bit m1; + + uint32_t b; + uint32_t y_copy = *y; + + m0 = is_smaller(*y & ((1 << w) - 1), *x & ((1 << w) - 1)); + m1 = is_smaller(0, t); + + cswap(x, &y_copy, m0); + b = m0 & m1; + *x ^= b << w; +} + +/* input: permutation pi */ +/* output: (2w-1)n/2 (or 0 if n==1) control bits c[0],c[step],c[2*step],... */ +/* requires n = 2^w */ +static void controlbitsfrompermutation(int w, int n, int step, int off, unsigned char *c, const uint32_t *pi) { + int i; + int j; + int k; + int t; + uint32_t ip[N] = {0}; + uint32_t I[2 * N] = {0}; + uint32_t P[2 * N] = {0}; + uint32_t PI[2 * N] = {0}; + uint32_t T[2 * N] = {0}; + uint32_t piflip[N] = {0}; + uint32_t subpi[2][N / 2] = {{0}}; + + if (w == 1) { + c[ off / 8 ] |= (pi[0] & 1) << (off % 8); + } + if (w <= 1) { + return; + } + + invert(n, ip, pi); + + for (i = 0; i < n; ++i) { + I[i] = ip[i] | (1 << w); + I[n + i] = pi[i]; + } + + for (i = 0; i < 2 * n; ++i) { + P[i] = (i >> w) + (i & ((1 << w) - 2)) + ((i & 1) << w); + } + + for (t = 0; t < w; ++t) { + composeinv(2 * n, PI, P, I); + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &PI[i], t); + } + + for (i = 0; i < 2 * n; ++i) { + T[i] = I[i ^ 1]; + } + + composeinv(2 * n, I, I, T); + + for (i = 0; i < 2 * n; ++i) { + T[i] = P[i ^ 1]; + } + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &T[i], 1); + } + } + + for (i = 0; i < n; ++i) { + for (j = 0; j < w; ++j) { + piflip[i] = pi[i]; + } + } + + for (i = 0; i < n / 2; ++i) { + c[ (off + i * step) / 8 ] |= ((P[i * 2] >> w) & 1) << ((off + i * step) % 8); + } + for (i = 0; i < n / 2; ++i) { + c[ (off + ((w - 1)*n + i) * step) / 8 ] |= ((P[n + i * 2] >> w) & 1) << ((off + ((w - 1) * n + i) * step) % 8); + } + + for (i = 0; i < n / 2; ++i) { + cswap(&piflip[i * 2], &piflip[i * 2 + 1], (P[n + i * 2] >> w) & 1); + } + + for (k = 0; k < 2; ++k) { + for (i = 0; i < n / 2; ++i) { + subpi[k][i] = piflip[i * 2 + k] >> 1; + } + } + + for (k = 0; k < 2; ++k) { + controlbitsfrompermutation(w - 1, n / 2, step * 2, off + step * (n / 2 + k), c, subpi[k]); + } +} + +/* input: pi, a permutation*/ +/* output: out, control bits w.r.t. pi */ +void PQCLEAN_MCELIECE460896_AVX_controlbits(unsigned char *out, const uint32_t *pi) { + unsigned int i; + unsigned char c[ (2 * GFBITS - 1) * (1 << GFBITS) / 16 ]; + + for (i = 0; i < sizeof(c); i++) { + c[i] = 0; + } + + controlbitsfrompermutation(GFBITS, (1 << GFBITS), 1, 0, c, pi); + + for (i = 0; i < sizeof(c); i++) { + out[i] = c[i]; + } +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece460896_avx/controlbits.h b/src/kem/classic_mceliece/pqclean_mceliece460896_avx/controlbits.h new file mode 100644 index 0000000000..eb40ca1d33 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece460896_avx/controlbits.h @@ -0,0 +1,15 @@ +#ifndef PQCLEAN_MCELIECE460896_AVX_CONTROLBITS_H +#define PQCLEAN_MCELIECE460896_AVX_CONTROLBITS_H +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. 
a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + + +#include + +void PQCLEAN_MCELIECE460896_AVX_sort_63b(int n, uint64_t *x); +void PQCLEAN_MCELIECE460896_AVX_controlbits(unsigned char *out, const uint32_t *pi); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece460896_avx/crypto_hash.h b/src/kem/classic_mceliece/pqclean_mceliece460896_avx/crypto_hash.h new file mode 100644 index 0000000000..439f460a10 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece460896_avx/crypto_hash.h @@ -0,0 +1,7 @@ +#ifndef PQCLEAN_MCELIECE460896_AVX_CRYPTO_HASH_H +#define PQCLEAN_MCELIECE460896_AVX_CRYPTO_HASH_H +#include "fips202.h" + +#define crypto_hash_32b(out,in,inlen) shake256(out, 32, in, inlen) + +#endif diff --git a/src/kem/classic_mceliece/pqclean_mceliece460896_avx/decrypt.c b/src/kem/classic_mceliece/pqclean_mceliece460896_avx/decrypt.c new file mode 100644 index 0000000000..e1385f4445 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece460896_avx/decrypt.c @@ -0,0 +1,234 @@ +/* + This file is for Niederreiter decryption +*/ + +#include "decrypt.h" + +#include "benes.h" +#include "bm.h" +#include "fft.h" +#include "fft_tr.h" +#include "params.h" +#include "util.h" + +#include + +static void scaling(vec256 out[][GFBITS], vec256 inv[][GFBITS], const unsigned char *sk, vec256 *recv) { + int i, j; + + vec128 sk_int[ GFBITS ]; + vec256 eval[32][ GFBITS ]; + vec256 tmp[ GFBITS ]; + + // computing inverses + + PQCLEAN_MCELIECE460896_AVX_irr_load(sk_int, sk); + + PQCLEAN_MCELIECE460896_AVX_fft(eval, sk_int); + + for (i = 0; i < 32; i++) { + PQCLEAN_MCELIECE460896_AVX_vec256_sq(eval[i], eval[i]); + } + + PQCLEAN_MCELIECE460896_AVX_vec256_copy(inv[0], eval[0]); + + for (i = 1; i < 32; i++) { + PQCLEAN_MCELIECE460896_AVX_vec256_mul(inv[i], inv[i - 1], eval[i]); + } + + PQCLEAN_MCELIECE460896_AVX_vec256_inv(tmp, inv[31]); + + for (i = 30; i >= 0; i--) { + PQCLEAN_MCELIECE460896_AVX_vec256_mul(inv[i + 1], tmp, inv[i]); + PQCLEAN_MCELIECE460896_AVX_vec256_mul(tmp, tmp, eval[i + 1]); + } + + PQCLEAN_MCELIECE460896_AVX_vec256_copy(inv[0], tmp); + + // + + for (i = 0; i < 32; i++) { + for (j = 0; j < GFBITS; j++) { + out[i][j] = PQCLEAN_MCELIECE460896_AVX_vec256_and(inv[i][j], recv[i]); + } + } +} + +static void preprocess(vec128 *recv, const unsigned char *s) { + int i; + unsigned char r[ 1024 ]; + + for (i = 0; i < SYND_BYTES; i++) { + r[i] = s[i]; + } + + for (i = SYND_BYTES; i < 1024; i++) { + r[i] = 0; + } + + for (i = 0; i < 64; i++) { + recv[i] = PQCLEAN_MCELIECE460896_AVX_load16(r + i * 16); + } +} + +static void postprocess(unsigned char *e, vec128 *err) { + int i; + unsigned char error8[ (1 << GFBITS) / 8 ]; + uint64_t v[2]; + + for (i = 0; i < 64; i++) { + v[0] = PQCLEAN_MCELIECE460896_AVX_vec128_extract(err[i], 0); + v[1] = PQCLEAN_MCELIECE460896_AVX_vec128_extract(err[i], 1); + + PQCLEAN_MCELIECE460896_AVX_store8(error8 + i * 16 + 0, v[0]); + PQCLEAN_MCELIECE460896_AVX_store8(error8 + i * 16 + 8, v[1]); + } + + for (i = 0; i < SYS_N / 8; i++) { + e[i] = error8[i]; + } +} + +static void scaling_inv(vec256 out[][GFBITS], vec256 inv[][GFBITS], vec256 *recv) { + int i, j; + + for (i = 0; i < 32; i++) { + for (j = 0; j < GFBITS; j++) { + out[i][j] = PQCLEAN_MCELIECE460896_AVX_vec256_and(inv[i][j], recv[i]); + } + } +} + +static uint16_t weight_check(unsigned char *e, vec128 *error) { + int i; + uint16_t w0 = 0; + uint16_t w1 = 0; + uint16_t check; + + for (i = 0; i < 64; i++) { + w0 += 
_mm_popcnt_u64( PQCLEAN_MCELIECE460896_AVX_vec128_extract(error[i], 0) ); + w0 += _mm_popcnt_u64( PQCLEAN_MCELIECE460896_AVX_vec128_extract(error[i], 1) ); + } + + for (i = 0; i < SYS_N / 8; i++) { + w1 += _mm_popcnt_u64( e[i] ); + } + + check = (w0 ^ SYS_T) | (w1 ^ SYS_T); + check -= 1; + check >>= 15; + + return check; +} + +static uint16_t synd_cmp(vec256 *s0, vec256 *s1) { + int i; + vec256 diff; + + diff = PQCLEAN_MCELIECE460896_AVX_vec256_xor(s0[0], s1[0]); + + for (i = 1; i < GFBITS; i++) { + diff = PQCLEAN_MCELIECE460896_AVX_vec256_or(diff, PQCLEAN_MCELIECE460896_AVX_vec256_xor(s0[i], s1[i])); + } + + return (uint16_t)PQCLEAN_MCELIECE460896_AVX_vec256_testz(diff); +} + +static void reformat_128to256(vec256 *out, vec128 *in) { + int i; + uint64_t v[4]; + + for (i = 0; i < 32; i++) { + v[0] = PQCLEAN_MCELIECE460896_AVX_vec128_extract(in[2 * i + 0], 0); + v[1] = PQCLEAN_MCELIECE460896_AVX_vec128_extract(in[2 * i + 0], 1); + v[2] = PQCLEAN_MCELIECE460896_AVX_vec128_extract(in[2 * i + 1], 0); + v[3] = PQCLEAN_MCELIECE460896_AVX_vec128_extract(in[2 * i + 1], 1); + + out[i] = PQCLEAN_MCELIECE460896_AVX_vec256_set4x(v[0], v[1], v[2], v[3]); + } +} + +static void reformat_256to128(vec128 *out, vec256 *in) { + int i; + uint64_t v[4]; + + for (i = 0; i < 32; i++) { + v[0] = PQCLEAN_MCELIECE460896_AVX_vec256_extract(in[i], 0); + v[1] = PQCLEAN_MCELIECE460896_AVX_vec256_extract(in[i], 1); + v[2] = PQCLEAN_MCELIECE460896_AVX_vec256_extract(in[i], 2); + v[3] = PQCLEAN_MCELIECE460896_AVX_vec256_extract(in[i], 3); + + out[2 * i + 0] = PQCLEAN_MCELIECE460896_AVX_vec128_set2x(v[0], v[1]); + out[2 * i + 1] = PQCLEAN_MCELIECE460896_AVX_vec128_set2x(v[2], v[3]); + } +} + +/* Niederreiter decryption with the Berlekamp decoder */ +/* intput: sk, secret key */ +/* c, ciphertext (syndrome) */ +/* output: e, error vector */ +/* return: 0 for success; 1 for failure */ +int PQCLEAN_MCELIECE460896_AVX_decrypt(unsigned char *e, const unsigned char *sk, const unsigned char *c) { + int i; + + uint16_t check_synd; + uint16_t check_weight; + + vec256 inv[ 32 ][ GFBITS ]; + vec256 scaled[ 32 ][ GFBITS ]; + vec256 eval[32][ GFBITS ]; + + vec128 error128[ 64 ]; + vec256 error256[ 32 ]; + + vec256 s_priv[ GFBITS ]; + vec256 s_priv_cmp[ GFBITS ]; + vec128 locator[ GFBITS ]; + + vec128 recv128[ 64 ]; + vec256 recv256[ 32 ]; + vec256 allone; + + vec128 bits_int[25][32]; + + // Berlekamp decoder + + preprocess(recv128, c); + + PQCLEAN_MCELIECE460896_AVX_load_bits(bits_int, sk + IRR_BYTES); + PQCLEAN_MCELIECE460896_AVX_benes(recv128, bits_int, 1); + + reformat_128to256(recv256, recv128); + + scaling(scaled, inv, sk, recv256); + PQCLEAN_MCELIECE460896_AVX_fft_tr(s_priv, scaled); + PQCLEAN_MCELIECE460896_AVX_bm(locator, s_priv); + + PQCLEAN_MCELIECE460896_AVX_fft(eval, locator); + + // reencryption and weight check + + allone = PQCLEAN_MCELIECE460896_AVX_vec256_set1_16b(0xFFFF); + + for (i = 0; i < 32; i++) { + error256[i] = PQCLEAN_MCELIECE460896_AVX_vec256_or_reduce(eval[i]); + error256[i] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(error256[i], allone); + } + + scaling_inv(scaled, inv, error256); + PQCLEAN_MCELIECE460896_AVX_fft_tr(s_priv_cmp, scaled); + + check_synd = synd_cmp(s_priv, s_priv_cmp); + + // + + reformat_256to128(error128, error256); + PQCLEAN_MCELIECE460896_AVX_benes(error128, bits_int, 0); + + postprocess(e, error128); + + check_weight = weight_check(e, error128); + + return 1 - (check_synd & check_weight); +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece460896_avx/decrypt.h 
b/src/kem/classic_mceliece/pqclean_mceliece460896_avx/decrypt.h new file mode 100644 index 0000000000..a84d461adb --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece460896_avx/decrypt.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_MCELIECE460896_AVX_DECRYPT_H +#define PQCLEAN_MCELIECE460896_AVX_DECRYPT_H +/* + This file is for Nieddereiter decryption +*/ + +int PQCLEAN_MCELIECE460896_AVX_decrypt(unsigned char * /*e*/, const unsigned char * /*sk*/, const unsigned char * /*c*/); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece460896_avx/encrypt.c b/src/kem/classic_mceliece/pqclean_mceliece460896_avx/encrypt.c new file mode 100644 index 0000000000..b7c6a960c1 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece460896_avx/encrypt.c @@ -0,0 +1,99 @@ +/* + This file is for Niederreiter encryption +*/ + +#include "encrypt.h" + +#include "int32_sort.h" +#include "params.h" +#include "randombytes.h" +#include "util.h" + +#include +#include +#include +#include + +#include "gf.h" + +/* input: public key pk, error vector e */ +/* output: syndrome s */ +extern void PQCLEAN_MCELIECE460896_AVX_syndrome_asm(unsigned char *s, const unsigned char *pk, unsigned char *e); + +/* output: e, an error vector of weight t */ +static void gen_e(unsigned char *e) { + int i, j, eq, count; + + uint16_t ind[ SYS_T * 2 ]; + int32_t ind32[ SYS_T * 2 ]; + uint64_t e_int[ (SYS_N + 63) / 64 ]; + uint64_t one = 1; + uint64_t mask; + uint64_t val[ SYS_T ]; + + while (1) { + randombytes((uint8_t *) ind, sizeof(ind)); + + for (i = 0; i < SYS_T * 2; i++) { + ind[i] &= GFMASK; + } + + // moving and counting indices in the correct range + + count = 0; + for (i = 0; i < SYS_T * 2; i++) { + if (ind[i] < SYS_N) { + ind32[ count++ ] = ind[i]; + } + } + + if (count < SYS_T) { + continue; + } + + // check for repetition + + PQCLEAN_MCELIECE460896_AVX_int32_sort(ind32, SYS_T); + + eq = 0; + for (i = 1; i < SYS_T; i++) { + if (ind32[i - 1] == ind32[i]) { + eq = 1; + } + } + + if (eq == 0) { + break; + } + } + + for (j = 0; j < SYS_T; j++) { + val[j] = one << (ind32[j] & 63); + } + + for (i = 0; i < (SYS_N + 63) / 64; i++) { + e_int[i] = 0; + + for (j = 0; j < SYS_T; j++) { + mask = i ^ (ind32[j] >> 6); + mask -= 1; + mask >>= 63; + mask = -mask; + + e_int[i] |= val[j] & mask; + } + } + + for (i = 0; i < (SYS_N + 63) / 64; i++) { + PQCLEAN_MCELIECE460896_AVX_store8(e, e_int[i]); + e += 8; + } +} + +/* input: public key pk */ +/* output: error vector e, syndrome s */ +void PQCLEAN_MCELIECE460896_AVX_encrypt(unsigned char *s, unsigned char *e, const unsigned char *pk) { + gen_e(e); + PQCLEAN_MCELIECE460896_AVX_syndrome_asm(s, pk, e); +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece460896_avx/encrypt.h b/src/kem/classic_mceliece/pqclean_mceliece460896_avx/encrypt.h new file mode 100644 index 0000000000..c223c3e93c --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece460896_avx/encrypt.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_MCELIECE460896_AVX_ENCRYPT_H +#define PQCLEAN_MCELIECE460896_AVX_ENCRYPT_H +/* + This file is for Niederreiter encryption +*/ + + +void PQCLEAN_MCELIECE460896_AVX_encrypt(unsigned char * /*s*/, unsigned char * /*e*/, const unsigned char * /*pk*/); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece460896_avx/fft.c b/src/kem/classic_mceliece/pqclean_mceliece460896_avx/fft.c new file mode 100644 index 0000000000..1deb9d6d4d --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece460896_avx/fft.c @@ -0,0 +1,262 @@ +/* + This file is for the Gao-Mateer FFT + sse 
http://www.math.clemson.edu/~sgao/papers/GM10.pdf +*/ + +#include "fft.h" + +#include "transpose.h" +#include "vec256.h" + +#include + +/* input: in, polynomial in bitsliced form */ +/* output: in, result of applying the radix conversions on in */ +static void radix_conversions(vec128 *in) { + int i, j, k; + vec128 t; + uint64_t v0, v1; + + const vec128 mask[5][2] = { + { + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0x8888888888888888, 0x8888888888888888), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0x4444444444444444, 0x4444444444444444) + }, + { + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0xC0C0C0C0C0C0C0C0, 0xC0C0C0C0C0C0C0C0), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0x3030303030303030, 0x3030303030303030) + }, + { + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0xF000F000F000F000, 0xF000F000F000F000), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0x0F000F000F000F00, 0x0F000F000F000F00) + }, + { + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0xFF000000FF000000, 0xFF000000FF000000), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0x00FF000000FF0000, 0x00FF000000FF0000) + }, + { + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0xFFFF000000000000, 0xFFFF000000000000), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0x0000FFFF00000000, 0x0000FFFF00000000) + } + }; + + const vec128 s[5][GFBITS] = { +#include "scalars_2x.inc" + }; + + // + + for (j = 0; j <= 5; j++) { + for (i = 0; i < GFBITS; i++) { + v1 = PQCLEAN_MCELIECE460896_AVX_vec128_extract(in[i], 1); + v1 ^= v1 >> 32; + v0 = PQCLEAN_MCELIECE460896_AVX_vec128_extract(in[i], 0); + v0 ^= v1 << 32; + in[i] = PQCLEAN_MCELIECE460896_AVX_vec128_set2x(v0, v1); + } + + for (i = 0; i < GFBITS; i++) { + for (k = 4; k >= j; k--) { + t = PQCLEAN_MCELIECE460896_AVX_vec128_and(in[i], mask[k][0]); + t = PQCLEAN_MCELIECE460896_AVX_vec128_srl_2x(t, 1 << k); + in[i] = PQCLEAN_MCELIECE460896_AVX_vec128_xor(in[i], t); + + t = PQCLEAN_MCELIECE460896_AVX_vec128_and(in[i], mask[k][1]); + t = PQCLEAN_MCELIECE460896_AVX_vec128_srl_2x(t, 1 << k); + in[i] = PQCLEAN_MCELIECE460896_AVX_vec128_xor(in[i], t); + } + } + + if (j < 5) { + PQCLEAN_MCELIECE460896_AVX_vec128_mul(in, in, s[j]); // scaling + } + } +} + +/* input: in, result of applying the radix conversions to the input polynomial */ +/* output: out, evaluation results (by applying the FFT butterflies) */ +static void butterflies(vec256 out[][ GFBITS ], vec128 *in) { + int i, j, k, s, b; + + vec128 tmp[ GFBITS ]; + vec256 tmp0[ GFBITS ]; + vec256 tmp1[ GFBITS ]; + vec128 t[ GFBITS ]; + + union { + vec128 v[8][ GFBITS + 1 ]; + vec256 V[8][ (GFBITS + 1) / 2 ]; + } pre; + + union { + vec128 v[64][ 2 ]; + vec256 V[64]; + } buf; + + uint64_t v0, v1; + + const vec256 consts[ 33 ][ GFBITS ] = { +#include "consts.inc" + }; + + uint64_t consts_ptr = 2; + + const unsigned char reversal[64] = { + 0, 32, 16, 48, 8, 40, 24, 56, + 4, 36, 20, 52, 12, 44, 28, 60, + 2, 34, 18, 50, 10, 42, 26, 58, + 6, 38, 22, 54, 14, 46, 30, 62, + 1, 33, 17, 49, 9, 41, 25, 57, + 5, 37, 21, 53, 13, 45, 29, 61, + 3, 35, 19, 51, 11, 43, 27, 59, + 7, 39, 23, 55, 15, 47, 31, 63 + }; + + const uint16_t beta[8] = {2522, 7827, 7801, 8035, 6897, 8167, 3476, 0}; + + // boradcast + + for (j = 0; j < GFBITS; j++) { + t[j] = PQCLEAN_MCELIECE460896_AVX_vec128_unpack_high(in[j], in[j]); + } + + for (i = 0; i < 8; i += 2) { + for (j = 0; j < GFBITS; j++) { + v0 = (beta[i + 0] >> j) & 1; + v0 = -v0; + v1 = (beta[i + 1] >> j) & 1; + v1 = -v1; + + tmp[j] = PQCLEAN_MCELIECE460896_AVX_vec128_set2x(v0, v1); + } + + PQCLEAN_MCELIECE460896_AVX_vec128_mul(tmp, t, tmp); + + for (j = 0; j < GFBITS; j++) 
{ + pre.v[i + 0][j] = PQCLEAN_MCELIECE460896_AVX_vec128_unpack_low(tmp[j], tmp[j]); + pre.v[i + 1][j] = PQCLEAN_MCELIECE460896_AVX_vec128_unpack_high(tmp[j], tmp[j]); + } + } + + for (i = 0; i < GFBITS; i += 2) { + if (i != GFBITS - 1) { + buf.v[0][1] = PQCLEAN_MCELIECE460896_AVX_vec128_unpack_low(in[i + 1], in[i + 1] ^ pre.v[6][i + 1]); + } + buf.v[0][0] = PQCLEAN_MCELIECE460896_AVX_vec128_unpack_low(in[i + 0], in[i + 0] ^ pre.v[6][i + 0]); + + buf.V[1] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[0], pre.V[0][i / 2]); + buf.V[16] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[0], pre.V[4][i / 2]); + buf.V[3] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[1], pre.V[1][i / 2]); + buf.V[48] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[16], pre.V[5][i / 2]); + buf.V[49] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[48], pre.V[0][i / 2]); + buf.V[2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[0], pre.V[1][i / 2]); + buf.V[51] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[49], pre.V[1][i / 2]); + buf.V[6] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[2], pre.V[2][i / 2]); + buf.V[50] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[51], pre.V[0][i / 2]); + buf.V[7] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[6], pre.V[0][i / 2]); + buf.V[54] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[50], pre.V[2][i / 2]); + buf.V[5] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[7], pre.V[1][i / 2]); + buf.V[55] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[54], pre.V[0][i / 2]); + buf.V[53] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[55], pre.V[1][i / 2]); + buf.V[4] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[0], pre.V[2][i / 2]); + buf.V[52] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[53], pre.V[0][i / 2]); + buf.V[12] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[4], pre.V[3][i / 2]); + buf.V[60] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[52], pre.V[3][i / 2]); + buf.V[13] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[12], pre.V[0][i / 2]); + buf.V[61] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[60], pre.V[0][i / 2]); + buf.V[15] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[13], pre.V[1][i / 2]); + buf.V[63] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[61], pre.V[1][i / 2]); + buf.V[14] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[15], pre.V[0][i / 2]); + buf.V[62] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[63], pre.V[0][i / 2]); + buf.V[10] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[14], pre.V[2][i / 2]); + buf.V[58] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[62], pre.V[2][i / 2]); + buf.V[11] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[10], pre.V[0][i / 2]); + buf.V[59] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[58], pre.V[0][i / 2]); + buf.V[9] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[11], pre.V[1][i / 2]); + buf.V[57] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[59], pre.V[1][i / 2]); + buf.V[56] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[57], pre.V[0][i / 2]); + buf.V[8] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[0], pre.V[3][i / 2]); + buf.V[40] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[56], pre.V[4][i / 2]); + buf.V[24] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[8], pre.V[4][i / 2]); + buf.V[41] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[40], pre.V[0][i / 2]); + buf.V[25] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[24], pre.V[0][i / 2]); + buf.V[43] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[41], pre.V[1][i / 2]); + buf.V[27] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[25], pre.V[1][i / 2]); + buf.V[42] = 
PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[43], pre.V[0][i / 2]); + buf.V[26] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[27], pre.V[0][i / 2]); + buf.V[46] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[42], pre.V[2][i / 2]); + buf.V[30] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[26], pre.V[2][i / 2]); + buf.V[47] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[46], pre.V[0][i / 2]); + buf.V[31] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[30], pre.V[0][i / 2]); + buf.V[45] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[47], pre.V[1][i / 2]); + buf.V[29] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[31], pre.V[1][i / 2]); + buf.V[44] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[45], pre.V[0][i / 2]); + buf.V[28] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[29], pre.V[0][i / 2]); + buf.V[36] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[44], pre.V[3][i / 2]); + buf.V[20] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[28], pre.V[3][i / 2]); + buf.V[37] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[36], pre.V[0][i / 2]); + buf.V[21] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[20], pre.V[0][i / 2]); + buf.V[39] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[37], pre.V[1][i / 2]); + buf.V[23] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[21], pre.V[1][i / 2]); + buf.V[38] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[39], pre.V[0][i / 2]); + buf.V[22] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[23], pre.V[0][i / 2]); + buf.V[34] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[38], pre.V[2][i / 2]); + buf.V[18] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[22], pre.V[2][i / 2]); + buf.V[35] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[34], pre.V[0][i / 2]); + buf.V[19] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[18], pre.V[0][i / 2]); + buf.V[33] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[35], pre.V[1][i / 2]); + buf.V[17] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[19], pre.V[1][i / 2]); + buf.V[32] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[33], pre.V[0][i / 2]); + + // transpose + + PQCLEAN_MCELIECE460896_AVX_transpose_64x256_sp(buf.V); + + for (j = 0; j < 32; j++) { + if (i != GFBITS - 1) { + out[j][i + 1] = PQCLEAN_MCELIECE460896_AVX_vec256_unpack_high(buf.V[ reversal[2 * j + 0] ], buf.V[ reversal[2 * j + 1] ]); + } + out[j][i + 0] = PQCLEAN_MCELIECE460896_AVX_vec256_unpack_low (buf.V[ reversal[2 * j + 0] ], buf.V[ reversal[2 * j + 1] ]); + } + } + + // butterflies + + for (k = 0; k < 32; k += 2) { + for (b = 0; b < GFBITS; b++) { + tmp0[b] = PQCLEAN_MCELIECE460896_AVX_vec256_unpack_low (out[k][b], out[k + 1][b]); + } + for (b = 0; b < GFBITS; b++) { + tmp1[b] = PQCLEAN_MCELIECE460896_AVX_vec256_unpack_high (out[k][b], out[k + 1][b]); + } + + PQCLEAN_MCELIECE460896_AVX_vec256_maa_asm(tmp0, tmp1, consts[1]); + + for (b = 0; b < GFBITS; b++) { + out[k][b] = PQCLEAN_MCELIECE460896_AVX_vec256_unpack_low (tmp0[b], tmp1[b]); + } + for (b = 0; b < GFBITS; b++) { + out[k + 1][b] = PQCLEAN_MCELIECE460896_AVX_vec256_unpack_high (tmp0[b], tmp1[b]); + } + } + + for (i = 0; i <= 4; i++) { + s = 1 << i; + + for (j = 0; j < 32; j += 2 * s) { + for (k = j; k < j + s; k++) { + PQCLEAN_MCELIECE460896_AVX_vec256_maa_asm(out[k], out[k + s], consts[ consts_ptr + (k - j) ]); + } + } + + consts_ptr += (1 << i); + } +} + +/* input: in, polynomial in bitsliced form */ +/* output: out, bitsliced results of evaluating in all the field elements */ +void PQCLEAN_MCELIECE460896_AVX_fft(vec256 out[][GFBITS], vec128 *in) { + radix_conversions(in); + butterflies(out, in); +} + diff --git 
a/src/kem/classic_mceliece/pqclean_mceliece460896_avx/fft.h b/src/kem/classic_mceliece/pqclean_mceliece460896_avx/fft.h new file mode 100644 index 0000000000..dc68724e20 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece460896_avx/fft.h @@ -0,0 +1,17 @@ +#ifndef PQCLEAN_MCELIECE460896_AVX_FFT_H +#define PQCLEAN_MCELIECE460896_AVX_FFT_H +/* + This file is for the Gao-Mateer FFT + sse http://www.math.clemson.edu/~sgao/papers/GM10.pdf +*/ + + +#include "params.h" +#include "vec128.h" +#include "vec256.h" +#include + +void PQCLEAN_MCELIECE460896_AVX_fft(vec256 /*out*/[][GFBITS], vec128 * /*in*/); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece460896_avx/fft_tr.c b/src/kem/classic_mceliece/pqclean_mceliece460896_avx/fft_tr.c new file mode 100644 index 0000000000..fe24bb9aa2 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece460896_avx/fft_tr.c @@ -0,0 +1,398 @@ +/* + This file is for transpose of the Gao-Mateer FFT + Functions with names ending with _tr are (roughly) the transpose of the corresponding functions in fft.c +*/ + +#include "fft_tr.h" + +#include "transpose.h" + +#include + +static void radix_conversions_tr(vec256 *in) { + int i, j, k; + vec256 t; + uint64_t v[4]; + + const vec256 mask[6][2] = { + { + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0x2222222222222222, 0x2222222222222222, 0x2222222222222222, 0x2222222222222222), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0x4444444444444444, 0x4444444444444444, 0x4444444444444444, 0x4444444444444444) + }, + { + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0x0C0C0C0C0C0C0C0C, 0x0C0C0C0C0C0C0C0C, 0x0C0C0C0C0C0C0C0C, 0x0C0C0C0C0C0C0C0C), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0x3030303030303030, 0x3030303030303030, 0x3030303030303030, 0x3030303030303030) + }, + { + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0x00F000F000F000F0, 0x00F000F000F000F0, 0x00F000F000F000F0, 0x00F000F000F000F0), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0x0F000F000F000F00, 0x0F000F000F000F00, 0x0F000F000F000F00, 0x0F000F000F000F00) + }, + { + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0x0000FF000000FF00, 0x0000FF000000FF00, 0x0000FF000000FF00, 0x0000FF000000FF00), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0x00FF000000FF0000, 0x00FF000000FF0000, 0x00FF000000FF0000, 0x00FF000000FF0000) + }, + { + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0x00000000FFFF0000, 0x00000000FFFF0000, 0x00000000FFFF0000, 0x00000000FFFF0000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0x0000FFFF00000000, 0x0000FFFF00000000, 0x0000FFFF00000000, 0x0000FFFF00000000) + }, + { + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0x00000000FFFFFFFF, 0x00000000FFFFFFFF, 0x00000000FFFFFFFF, 0x00000000FFFFFFFF) + } + }; + + const vec256 s[6][GFBITS] = { +#include "scalars_4x.inc" + }; + + // + + for (j = 6; j >= 0; j--) { + if (j < 6) { + PQCLEAN_MCELIECE460896_AVX_vec256_mul(in, in, s[j]); // scaling + } + + for (k = j; k <= 4; k++) { + for (i = 0; i < GFBITS; i++) { + t = PQCLEAN_MCELIECE460896_AVX_vec256_and(in[i], mask[k][0]); + t = PQCLEAN_MCELIECE460896_AVX_vec256_sll_4x(t, 1 << k); + in[i] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(in[i], t); + + t = PQCLEAN_MCELIECE460896_AVX_vec256_and(in[i], mask[k][1]); + t = PQCLEAN_MCELIECE460896_AVX_vec256_sll_4x(t, 1 << k); + in[i] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(in[i], t); + } + } + + if (j <= 5) { + for (i = 0; i < GFBITS; i++) { + v[0] = PQCLEAN_MCELIECE460896_AVX_vec256_extract(in[i], 0); + v[1] = 
PQCLEAN_MCELIECE460896_AVX_vec256_extract(in[i], 1); + v[2] = PQCLEAN_MCELIECE460896_AVX_vec256_extract(in[i], 2); + v[3] = PQCLEAN_MCELIECE460896_AVX_vec256_extract(in[i], 3); + + v[1] ^= v[0] >> 32; + v[1] ^= v[1] << 32; + v[3] ^= v[2] >> 32; + v[3] ^= v[3] << 32; + + in[i] = PQCLEAN_MCELIECE460896_AVX_vec256_set4x(v[0], v[1], v[2], v[3]); + } + } + + for (i = 0; i < GFBITS; i++) { + v[0] = PQCLEAN_MCELIECE460896_AVX_vec256_extract(in[i], 0); + v[1] = PQCLEAN_MCELIECE460896_AVX_vec256_extract(in[i], 1); + v[2] = PQCLEAN_MCELIECE460896_AVX_vec256_extract(in[i], 2); + v[3] = PQCLEAN_MCELIECE460896_AVX_vec256_extract(in[i], 3); + + v[2] ^= v[1]; + v[3] ^= v[2]; + + in[i] = PQCLEAN_MCELIECE460896_AVX_vec256_set4x(v[0], v[1], v[2], v[3]); + } + + } +} + +static void butterflies_tr(vec256 *out, vec256 in[][ GFBITS ]) { + int i, j, k, s, b; + + vec256 t0[ GFBITS ]; + vec256 t1[ GFBITS ]; + vec256 t; + + vec128 out128[ GFBITS ][ 2 ]; + vec128 tmp[ GFBITS ]; + + union { + vec128 v[6][ GFBITS + 1 ]; + vec256 V[6][ (GFBITS + 1) / 2 ]; + } pre; + + union { + vec128 v[64][ 2 ]; + vec256 V[64]; + } buf; + + const vec256 consts[ 33 ][ GFBITS ] = { +#include "consts.inc" + }; + + uint64_t v[4]; + uint64_t consts_ptr = 33; + + const unsigned char reversal[] = { + 0, 32, 16, 48, 8, 40, 24, 56, + 4, 36, 20, 52, 12, 44, 28, 60, + 2, 34, 18, 50, 10, 42, 26, 58, + 6, 38, 22, 54, 14, 46, 30, 62, + 1, 33, 17, 49, 9, 41, 25, 57, + 5, 37, 21, 53, 13, 45, 29, 61, + 3, 35, 19, 51, 11, 43, 27, 59, + 7, 39, 23, 55, 15, 47, 31, 63 + }; + + const uint16_t beta[6] = {5246, 5306, 6039, 6685, 4905, 6755}; + + // butterflies + + for (i = 4; i >= 0; i--) { + s = 1 << i; + consts_ptr -= s; + + for (j = 0; j < 32; j += 2 * s) { + for (k = j; k < j + s; k++) { + PQCLEAN_MCELIECE460896_AVX_vec256_ama_asm(in[k], in[k + s], consts[ consts_ptr + (k - j) ]); + } + } + + } + + for (k = 0; k < 32; k += 2) { + for (b = 0; b < GFBITS; b++) { + t0[b] = PQCLEAN_MCELIECE460896_AVX_vec256_unpack_low(in[k][b], in[k + 1][b]); + } + for (b = 0; b < GFBITS; b++) { + t1[b] = PQCLEAN_MCELIECE460896_AVX_vec256_unpack_high(in[k][b], in[k + 1][b]); + } + + PQCLEAN_MCELIECE460896_AVX_vec256_ama_asm(t0, t1, consts[1]); + + for (b = 0; b < GFBITS; b++) { + in[k][b] = PQCLEAN_MCELIECE460896_AVX_vec256_unpack_low(t0[b], t1[b]); + } + for (b = 0; b < GFBITS; b++) { + in[k + 1][b] = PQCLEAN_MCELIECE460896_AVX_vec256_unpack_high(t0[b], t1[b]); + } + + for (b = 0; b < GFBITS; b++) { + t0[b] = PQCLEAN_MCELIECE460896_AVX_vec256_unpack_low_2x(in[k][b], in[k + 1][b]); + } + for (b = 0; b < GFBITS; b++) { + t1[b] = PQCLEAN_MCELIECE460896_AVX_vec256_unpack_high_2x(in[k][b], in[k + 1][b]); + } + + PQCLEAN_MCELIECE460896_AVX_vec256_ama_asm(t0, t1, consts[0]); + + for (b = 0; b < GFBITS; b++) { + in[k + 0][b] = PQCLEAN_MCELIECE460896_AVX_vec256_unpack_low_2x(t0[b], t1[b]); + } + for (b = 0; b < GFBITS; b++) { + in[k + 1][b] = PQCLEAN_MCELIECE460896_AVX_vec256_unpack_high_2x(t0[b], t1[b]); + } + } + + + // boradcast + + for (i = 0; i < GFBITS; i += 2) { + // transpose + + for (k = 0; k < 32; k++) { + if (i != GFBITS - 1) { + buf.v[ reversal[2 * k + 0] ][1] = PQCLEAN_MCELIECE460896_AVX_vec256_extract2x(in[ k ][i + 1], 0); + buf.v[ reversal[2 * k + 1] ][1] = PQCLEAN_MCELIECE460896_AVX_vec256_extract2x(in[ k ][i + 1], 1); + } + + buf.v[ reversal[2 * k + 0] ][0] = PQCLEAN_MCELIECE460896_AVX_vec256_extract2x(in[ k ][i + 0], 0); + buf.v[ reversal[2 * k + 1] ][0] = PQCLEAN_MCELIECE460896_AVX_vec256_extract2x(in[ k ][i + 0], 1); + } + + 
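        /*
           Note (added for exposition; not part of the upstream PQClean file):
           the loop above scatters two bit-slices of the input into 64 vector
           lanes in bit-reversed order (reversal[] is the 6-bit bit-reversal
           permutation of 0..63).  The 64x256 bit-matrix transpose below then,
           roughly speaking, switches the data from this bitsliced layout to a
           per-lane layout so that the xor-accumulation that follows can gather
           the contributions belonging to each output of the transposed FFT.
        */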
PQCLEAN_MCELIECE460896_AVX_transpose_64x256_sp(buf.V); + + // + + pre.V[0][i / 2] = buf.V[32]; + buf.V[33] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[33], buf.V[32]); + pre.V[1][i / 2] = buf.V[33]; + buf.V[35] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[35], buf.V[33]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[0][i / 2], buf.V[35]); + buf.V[34] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[34], buf.V[35]); + pre.V[2][i / 2] = buf.V[34]; + buf.V[38] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[38], buf.V[34]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[0][i / 2], buf.V[38]); + buf.V[39] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[39], buf.V[38]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[1][i / 2], buf.V[39]); + buf.V[37] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[37], buf.V[39]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[0][i / 2], buf.V[37]); + buf.V[36] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[36], buf.V[37]); + pre.V[3][i / 2] = buf.V[36]; + buf.V[44] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[44], buf.V[36]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[0][i / 2], buf.V[44]); + buf.V[45] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[45], buf.V[44]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[1][i / 2], buf.V[45]); + buf.V[47] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[47], buf.V[45]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[0][i / 2], buf.V[47]); + buf.V[46] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[46], buf.V[47]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[2][i / 2], buf.V[46]); + buf.V[42] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[42], buf.V[46]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[0][i / 2], buf.V[42]); + buf.V[43] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[43], buf.V[42]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[1][i / 2], buf.V[43]); + buf.V[41] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[41], buf.V[43]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[0][i / 2], buf.V[41]); + buf.V[40] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[40], buf.V[41]); + pre.V[4][i / 2] = buf.V[40]; + buf.V[56] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[56], buf.V[40]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[0][i / 2], buf.V[56]); + buf.V[57] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[57], buf.V[56]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[1][i / 2], buf.V[57]); + buf.V[59] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[59], buf.V[57]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[0][i / 2], buf.V[59]); + buf.V[58] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[58], buf.V[59]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[2][i / 2], buf.V[58]); + buf.V[62] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[62], buf.V[58]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[0][i / 2], buf.V[62]); + buf.V[63] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[63], buf.V[62]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[1][i / 2], buf.V[63]); + buf.V[61] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[61], buf.V[63]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[0][i / 2], buf.V[61]); + buf.V[60] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[60], buf.V[61]); + pre.V[3][i / 2] = 
PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[3][i / 2], buf.V[60]); + buf.V[52] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[52], buf.V[60]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[0][i / 2], buf.V[52]); + buf.V[53] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[53], buf.V[52]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[1][i / 2], buf.V[53]); + buf.V[55] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[55], buf.V[53]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[0][i / 2], buf.V[55]); + buf.V[54] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[54], buf.V[55]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[2][i / 2], buf.V[54]); + buf.V[50] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[50], buf.V[54]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[0][i / 2], buf.V[50]); + buf.V[51] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[51], buf.V[50]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[1][i / 2], buf.V[51]); + buf.V[49] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[49], buf.V[51]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[0][i / 2], buf.V[49]); + buf.V[48] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[48], buf.V[49]); + pre.V[5][i / 2] = buf.V[48]; + buf.V[16] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[16], buf.V[48]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[0][i / 2], buf.V[16]); + buf.V[17] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[17], buf.V[16]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[1][i / 2], buf.V[17]); + buf.V[19] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[19], buf.V[17]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[0][i / 2], buf.V[19]); + buf.V[18] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[18], buf.V[19]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[2][i / 2], buf.V[18]); + buf.V[22] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[22], buf.V[18]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[0][i / 2], buf.V[22]); + buf.V[23] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[23], buf.V[22]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[1][i / 2], buf.V[23]); + buf.V[21] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[21], buf.V[23]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[0][i / 2], buf.V[21]); + buf.V[20] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[20], buf.V[21]); + pre.V[3][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[3][i / 2], buf.V[20]); + buf.V[28] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[28], buf.V[20]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[0][i / 2], buf.V[28]); + buf.V[29] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[29], buf.V[28]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[1][i / 2], buf.V[29]); + buf.V[31] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[31], buf.V[29]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[0][i / 2], buf.V[31]); + buf.V[30] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[30], buf.V[31]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[2][i / 2], buf.V[30]); + buf.V[26] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[26], buf.V[30]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[0][i / 2], buf.V[26]); + buf.V[27] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[27], buf.V[26]); + pre.V[1][i / 2] = 
PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[1][i / 2], buf.V[27]); + buf.V[25] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[25], buf.V[27]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[0][i / 2], buf.V[25]); + buf.V[24] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[24], buf.V[25]); + pre.V[4][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[4][i / 2], buf.V[24]); + buf.V[8] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[8], buf.V[24]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[0][i / 2], buf.V[8]); + buf.V[9] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[9], buf.V[8]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[1][i / 2], buf.V[9]); + buf.V[11] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[11], buf.V[9]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[0][i / 2], buf.V[11]); + buf.V[10] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[10], buf.V[11]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[2][i / 2], buf.V[10]); + buf.V[14] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[14], buf.V[10]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[0][i / 2], buf.V[14]); + buf.V[15] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[15], buf.V[14]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[1][i / 2], buf.V[15]); + buf.V[13] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[13], buf.V[15]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[0][i / 2], buf.V[13]); + buf.V[12] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[12], buf.V[13]); + pre.V[3][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[3][i / 2], buf.V[12]); + buf.V[4] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[4], buf.V[12]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[0][i / 2], buf.V[4]); + buf.V[5] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[5], buf.V[4]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[1][i / 2], buf.V[5]); + buf.V[7] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[7], buf.V[5]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[0][i / 2], buf.V[7]); + buf.V[6] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[6], buf.V[7]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[2][i / 2], buf.V[6]); + buf.V[2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[2], buf.V[6]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[0][i / 2], buf.V[2]); + buf.V[3] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[3], buf.V[2]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[1][i / 2], buf.V[3]); + buf.V[1] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[1], buf.V[3]); + + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896_AVX_vec256_xor(pre.V[0][i / 2], buf.V[1]); + t = PQCLEAN_MCELIECE460896_AVX_vec256_xor(buf.V[0], buf.V[1]); + + if (i != GFBITS - 1) { + out128[i + 1][0] = PQCLEAN_MCELIECE460896_AVX_vec256_extract2x(t, 1); + } + out128[i + 0][0] = PQCLEAN_MCELIECE460896_AVX_vec256_extract2x(t, 0); + + } + + // + + for (j = 0; j < GFBITS; j++) { + tmp[j] = PQCLEAN_MCELIECE460896_AVX_vec128_setbits((beta[0] >> j) & 1); + } + + PQCLEAN_MCELIECE460896_AVX_vec128_mul(tmp, pre.v[0], tmp); + + for (b = 0; b < GFBITS; b++) { + out128[b][1] = tmp[b]; + } + + for (i = 1; i < 6; i++) { + for (j = 0; j < GFBITS; j++) { + tmp[j] = PQCLEAN_MCELIECE460896_AVX_vec128_setbits((beta[i] >> j) & 1); + } + + PQCLEAN_MCELIECE460896_AVX_vec128_mul(tmp, pre.v[i], tmp); + + for (b = 0; b < GFBITS; b++) { + out128[b][1] = 
PQCLEAN_MCELIECE460896_AVX_vec128_xor(out128[b][1], tmp[b]); + } + } + + for (b = 0; b < GFBITS; b++) { + v[0] = PQCLEAN_MCELIECE460896_AVX_vec128_extract(out128[b][0], 0); + v[1] = PQCLEAN_MCELIECE460896_AVX_vec128_extract(out128[b][0], 1); + v[2] = PQCLEAN_MCELIECE460896_AVX_vec128_extract(out128[b][1], 0); + v[3] = PQCLEAN_MCELIECE460896_AVX_vec128_extract(out128[b][1], 1); + + out[b] = PQCLEAN_MCELIECE460896_AVX_vec256_set4x(v[0], v[1], v[2], v[3]); + } +} + +/* justifying the length of the output */ +static void postprocess(vec256 *out) { + int i; + uint64_t v[4]; + + for (i = 0; i < 13; i++) { + v[0] = PQCLEAN_MCELIECE460896_AVX_vec256_extract(out[i], 0); + v[1] = PQCLEAN_MCELIECE460896_AVX_vec256_extract(out[i], 1); + v[2] = PQCLEAN_MCELIECE460896_AVX_vec256_extract(out[i], 2); + v[3] = PQCLEAN_MCELIECE460896_AVX_vec256_extract(out[i], 3); + + v[3] = 0; + + out[i] = PQCLEAN_MCELIECE460896_AVX_vec256_set4x(v[0], v[1], v[2], v[3]); + } +} + +void PQCLEAN_MCELIECE460896_AVX_fft_tr(vec256 *out, vec256 in[][ GFBITS ]) { + butterflies_tr(out, in); + radix_conversions_tr(out); + + postprocess(out); +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece460896_avx/fft_tr.h b/src/kem/classic_mceliece/pqclean_mceliece460896_avx/fft_tr.h new file mode 100644 index 0000000000..2dd4743942 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece460896_avx/fft_tr.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE460896_AVX_FFT_TR_H +#define PQCLEAN_MCELIECE460896_AVX_FFT_TR_H +/* + This file is for transpose of the Gao-Mateer FFT +*/ + + +#include "params.h" +#include "vec256.h" + +void PQCLEAN_MCELIECE460896_AVX_fft_tr(vec256 * /*out*/, vec256 /*in*/[][ GFBITS ]); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece460896_avx/gf.c b/src/kem/classic_mceliece/pqclean_mceliece460896_avx/gf.c new file mode 100644 index 0000000000..903fbab48e --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece460896_avx/gf.c @@ -0,0 +1,205 @@ +/* + this file is for functions for field arithmetic +*/ + +#include "gf.h" + +#include "params.h" + +/* check if a == 0 */ +gf PQCLEAN_MCELIECE460896_AVX_gf_iszero(gf a) { + uint32_t t = a; + + t -= 1; + t >>= 19; + + return (gf) t; +} + +/* field multiplication */ +gf PQCLEAN_MCELIECE460896_AVX_gf_mul(gf in0, gf in1) { + int i; + + uint64_t tmp; + uint64_t t0; + uint64_t t1; + uint64_t t; + + t0 = in0; + t1 = in1; + + tmp = t0 * (t1 & 1); + + for (i = 1; i < GFBITS; i++) { + tmp ^= (t0 * (t1 & ((uint64_t)1 << i))); + } + + // + + t = tmp & 0x1FF0000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + t = tmp & 0x000E000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + return tmp & GFMASK; +} + +/* square twice */ +static inline gf gf_sq2(gf in) { + int i; + + const uint64_t B[] = {0x1111111111111111, + 0x0303030303030303, + 0x000F000F000F000F, + 0x000000FF000000FF + }; + + const uint64_t M[] = {0x0001FF0000000000, + 0x000000FF80000000, + 0x000000007FC00000, + 0x00000000003FE000 + }; + + uint64_t x = in; + uint64_t t; + + x = (x | (x << 24)) & B[3]; + x = (x | (x << 12)) & B[2]; + x = (x | (x << 6)) & B[1]; + x = (x | (x << 3)) & B[0]; + + for (i = 0; i < 4; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* square and multiply */ +static inline gf gf_sqmul(gf in, gf m) { + int i; + + uint64_t x; + uint64_t t0; + uint64_t t1; + uint64_t t; + + const uint64_t M[] = {0x0000001FF0000000, + 0x000000000FF80000, + 0x000000000007E000 + }; + + t0 = in; + t1 = m; + + x = (t1 << 6) 
* (t0 & (1 << 6)); + + t0 ^= (t0 << 7); + + x ^= (t1 * (t0 & (0x04001))); + x ^= (t1 * (t0 & (0x08002))) << 1; + x ^= (t1 * (t0 & (0x10004))) << 2; + x ^= (t1 * (t0 & (0x20008))) << 3; + x ^= (t1 * (t0 & (0x40010))) << 4; + x ^= (t1 * (t0 & (0x80020))) << 5; + + for (i = 0; i < 3; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* square twice and multiply */ +static inline gf gf_sq2mul(gf in, gf m) { + int i; + + uint64_t x; + uint64_t t0; + uint64_t t1; + uint64_t t; + + const uint64_t M[] = {0x1FF0000000000000, + 0x000FF80000000000, + 0x000007FC00000000, + 0x00000003FE000000, + 0x0000000001FE0000, + 0x000000000001E000 + }; + + t0 = in; + t1 = m; + + x = (t1 << 18) * (t0 & (1 << 6)); + + t0 ^= (t0 << 21); + + x ^= (t1 * (t0 & (0x010000001))); + x ^= (t1 * (t0 & (0x020000002))) << 3; + x ^= (t1 * (t0 & (0x040000004))) << 6; + x ^= (t1 * (t0 & (0x080000008))) << 9; + x ^= (t1 * (t0 & (0x100000010))) << 12; + x ^= (t1 * (t0 & (0x200000020))) << 15; + + for (i = 0; i < 6; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* return num/den */ +gf PQCLEAN_MCELIECE460896_AVX_gf_frac(gf den, gf num) { + gf tmp_11; + gf tmp_1111; + gf out; + + tmp_11 = gf_sqmul(den, den); // ^11 + tmp_1111 = gf_sq2mul(tmp_11, tmp_11); // ^1111 + out = gf_sq2(tmp_1111); + out = gf_sq2mul(out, tmp_1111); // ^11111111 + out = gf_sq2(out); + out = gf_sq2mul(out, tmp_1111); // ^111111111111 + + return gf_sqmul(out, num); // ^1111111111110 = ^-1 +} + +/* return 1/den */ +gf PQCLEAN_MCELIECE460896_AVX_gf_inv(gf in) { + return PQCLEAN_MCELIECE460896_AVX_gf_frac(in, ((gf) 1)); +} + +/* multiplication in GF((2^m)^t) */ +void PQCLEAN_MCELIECE460896_AVX_GF_mul(gf *out, const gf *in0, const gf *in1) { + int i, j; + + gf prod[191]; + + for (i = 0; i < 191; i++) { + prod[i] = 0; + } + + for (i = 0; i < 96; i++) { + for (j = 0; j < 96; j++) { + prod[i + j] ^= PQCLEAN_MCELIECE460896_AVX_gf_mul(in0[i], in1[j]); + } + } + + // + + for (i = 190; i >= 96; i--) { + prod[i - 85] ^= PQCLEAN_MCELIECE460896_AVX_gf_mul(prod[i], (gf) 714); + prod[i - 91] ^= PQCLEAN_MCELIECE460896_AVX_gf_mul(prod[i], (gf) 5296); + prod[i - 92] ^= PQCLEAN_MCELIECE460896_AVX_gf_mul(prod[i], (gf) 728); + prod[i - 96] ^= PQCLEAN_MCELIECE460896_AVX_gf_mul(prod[i], (gf) 5881); + } + + for (i = 0; i < 96; i++) { + out[i] = prod[i]; + } +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece460896_avx/gf.h b/src/kem/classic_mceliece/pqclean_mceliece460896_avx/gf.h new file mode 100644 index 0000000000..97f85d4fe1 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece460896_avx/gf.h @@ -0,0 +1,22 @@ +#ifndef PQCLEAN_MCELIECE460896_AVX_GF_H +#define PQCLEAN_MCELIECE460896_AVX_GF_H +/* + This file is for functions for field arithmetic +*/ + + +#include + +typedef uint16_t gf; + +gf PQCLEAN_MCELIECE460896_AVX_gf_iszero(gf a); +gf PQCLEAN_MCELIECE460896_AVX_gf_add(gf in0, gf in1); +gf PQCLEAN_MCELIECE460896_AVX_gf_mul(gf in0, gf in1); +uint64_t PQCLEAN_MCELIECE460896_AVX_gf_mul2(gf a, gf b0, gf b1); +gf PQCLEAN_MCELIECE460896_AVX_gf_frac(gf den, gf num); +gf PQCLEAN_MCELIECE460896_AVX_gf_inv(gf in); + +void PQCLEAN_MCELIECE460896_AVX_GF_mul(gf *out, const gf *in0, const gf *in1); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece460896_avx/int32_sort.c b/src/kem/classic_mceliece/pqclean_mceliece460896_avx/int32_sort.c new file mode 100644 index 0000000000..aae917bd0c --- /dev/null +++ 
b/src/kem/classic_mceliece/pqclean_mceliece460896_avx/int32_sort.c @@ -0,0 +1,1211 @@ +#include "int32_sort.h" +#define int32 int32_t + +#include + +typedef __m256i int32x8; +#define int32x8_load(z) _mm256_loadu_si256((__m256i_u *) (z)) +#define int32x8_store(z,i) _mm256_storeu_si256((__m256i_u *) (z),(i)) +#define int32x8_min _mm256_min_epi32 +#define int32x8_max _mm256_max_epi32 + +#define int32x8_MINMAX(a,b) \ + do { \ + int32x8 c = int32x8_min(a, b); \ + (b) = int32x8_max(a,b); \ + (a) = c; \ + } while(0) + +static inline void int32_MINMAX(int32 *a, int32 *b) { + int32 ab = *b ^ *a; + int32 c = *b - *a; + c ^= ab & (c ^ *b); + c >>= 31; + c &= ab; + *a ^= c; + *b ^= c; +} + +static void minmax_vector(int32 *x, int32 *y, size_t n) { + if (n < 8) { + while (n > 0) { + int32_MINMAX(x, y); + ++x; + ++y; + --n; + } + return; + } + if (n & 7) { + int32x8 x0 = int32x8_load(x + n - 8); + int32x8 y0 = int32x8_load(y + n - 8); + int32x8_MINMAX(x0, y0); + int32x8_store(x + n - 8, x0); + int32x8_store(y + n - 8, y0); + n &= ~7; + } + do { + int32x8 x0 = int32x8_load(x); + int32x8 y0 = int32x8_load(y); + int32x8_MINMAX(x0, y0); + int32x8_store(x, x0); + int32x8_store(y, y0); + x += 8; + y += 8; + n -= 8; + } while (n); +} + +/* stages 8,4,2,1 of size-16 bitonic merging */ +static void merge16_finish(int32 *x, int32x8 x0, int32x8 x1, int flagdown) { + int32x8 b0, b1, c0, c1, mask; + + int32x8_MINMAX(x0, x1); + + b0 = _mm256_permute2x128_si256(x0, x1, 0x20); /* A0123B0123 */ + b1 = _mm256_permute2x128_si256(x0, x1, 0x31); /* A4567B4567 */ + + int32x8_MINMAX(b0, b1); + + c0 = _mm256_unpacklo_epi64(b0, b1); /* A0145B0145 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* A2367B2367 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A0213B0213 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A4657B4657 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* A0246B0246 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* A1357B1357 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A0123B0123 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A4567B4567 */ + + x0 = _mm256_permute2x128_si256(b0, b1, 0x20); /* A01234567 */ + x1 = _mm256_permute2x128_si256(b0, b1, 0x31); /* A01234567 */ + + if (flagdown) { + mask = _mm256_set1_epi32(-1); + x0 ^= mask; + x1 ^= mask; + } + + int32x8_store(&x[0], x0); + int32x8_store(&x[8], x1); +} + +/* stages 64,32 of bitonic merging; n is multiple of 128 */ +static void int32_twostages_32(int32 *x, size_t n) { + size_t i; + + while (n > 0) { + for (i = 0; i < 32; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + 32]); + int32x8 x2 = int32x8_load(&x[i + 64]); + int32x8 x3 = int32x8_load(&x[i + 96]); + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + 32], x1); + int32x8_store(&x[i + 64], x2); + int32x8_store(&x[i + 96], x3); + } + x += 128; + n -= 128; + } +} + +/* stages 4q,2q,q of bitonic merging */ +static size_t int32_threestages(int32 *x, size_t n, size_t q) { + size_t k, i; + + for (k = 0; k + 8 * q <= n; k += 8 * q) { + for (i = k; i < k + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x4); + 
int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + } + + return k; +} + +/* n is a power of 2; n >= 8; if n == 8 then flagdown */ +// NOLINTNEXTLINE(google-readability-function-size) +static void int32_sort_2power(int32 *x, size_t n, int flagdown) { + size_t p, q, i, j, k; + int32x8 mask; + + if (n == 8) { + int32 x0 = x[0]; + int32 x1 = x[1]; + int32 x2 = x[2]; + int32 x3 = x[3]; + int32 x4 = x[4]; + int32 x5 = x[5]; + int32 x6 = x[6]; + int32 x7 = x[7]; + + /* odd-even sort instead of bitonic sort */ + + int32_MINMAX(&x1, &x0); + int32_MINMAX(&x3, &x2); + int32_MINMAX(&x2, &x0); + int32_MINMAX(&x3, &x1); + int32_MINMAX(&x2, &x1); + + int32_MINMAX(&x5, &x4); + int32_MINMAX(&x7, &x6); + int32_MINMAX(&x6, &x4); + int32_MINMAX(&x7, &x5); + int32_MINMAX(&x6, &x5); + + int32_MINMAX(&x4, &x0); + int32_MINMAX(&x6, &x2); + int32_MINMAX(&x4, &x2); + + int32_MINMAX(&x5, &x1); + int32_MINMAX(&x7, &x3); + int32_MINMAX(&x5, &x3); + + int32_MINMAX(&x2, &x1); + int32_MINMAX(&x4, &x3); + int32_MINMAX(&x6, &x5); + + x[0] = x0; + x[1] = x1; + x[2] = x2; + x[3] = x3; + x[4] = x4; + x[5] = x5; + x[6] = x6; + x[7] = x7; + return; + } + + if (n == 16) { + int32x8 x0, x1, b0, b1, c0, c1; + + x0 = int32x8_load(&x[0]); + x1 = int32x8_load(&x[8]); + + mask = _mm256_set_epi32(0, 0, -1, -1, 0, 0, -1, -1); + + x0 ^= mask; /* A01234567 */ + x1 ^= mask; /* B01234567 */ + + b0 = _mm256_unpacklo_epi32(x0, x1); /* AB0AB1AB4AB5 */ + b1 = _mm256_unpackhi_epi32(x0, x1); /* AB2AB3AB6AB7 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* AB0AB2AB4AB6 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* AB1AB3AB5AB7 */ + + int32x8_MINMAX(c0, c1); + + mask = _mm256_set_epi32(0, 0, -1, -1, -1, -1, 0, 0); + c0 ^= mask; + c1 ^= mask; + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A01B01A45B45 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A23B23A67B67 */ + + int32x8_MINMAX(b0, b1); + + x0 = _mm256_unpacklo_epi64(b0, b1); /* A01234567 */ + x1 = _mm256_unpackhi_epi64(b0, b1); /* B01234567 */ + + b0 = _mm256_unpacklo_epi32(x0, x1); /* AB0AB1AB4AB5 */ + b1 = _mm256_unpackhi_epi32(x0, x1); /* AB2AB3AB6AB7 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* AB0AB2AB4AB6 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* AB1AB3AB5AB7 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A01B01A45B45 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A23B23A67B67 */ + + b0 ^= mask; + b1 ^= mask; + + c0 = _mm256_permute2x128_si256(b0, b1, 0x20); /* A01B01A23B23 */ + c1 = _mm256_permute2x128_si256(b0, b1, 0x31); /* A45B45A67B67 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_permute2x128_si256(c0, c1, 0x20); /* A01B01A45B45 */ + b1 = _mm256_permute2x128_si256(c0, c1, 0x31); /* A23B23A67B67 */ + + int32x8_MINMAX(b0, b1); + + x0 = _mm256_unpacklo_epi64(b0, b1); /* A01234567 */ + x1 = _mm256_unpackhi_epi64(b0, b1); /* B01234567 */ + + b0 = _mm256_unpacklo_epi32(x0, x1); /* AB0AB1AB4AB5 */ + b1 = _mm256_unpackhi_epi32(x0, x1); /* AB2AB3AB6AB7 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* AB0AB2AB4AB6 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* AB1AB3AB5AB7 */ 
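+        /* In this n == 16 path the unpack/permute pairs only regroup lanes
+           so that each vector compare/exchange meets its sorting-network
+           partner; the XORs with a lane mask complement selected elements,
+           and because complementing a two's-complement value reverses its
+           order, the single min/max primitive yields both the ascending and
+           the descending halves that a bitonic merge requires. */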
+ + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A01B01A45B45 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A23B23A67B67 */ + + x0 = _mm256_unpacklo_epi64(b0, b1); /* A01234567 */ + x1 = _mm256_unpackhi_epi64(b0, b1); /* B01234567 */ + + mask = _mm256_set1_epi32(-1); + if (flagdown) { + x1 ^= mask; + } else { + x0 ^= mask; + } + + merge16_finish(x, x0, x1, flagdown); + return; + } + + if (n == 32) { + int32x8 x0, x1, x2, x3; + + int32_sort_2power(x, 16, 1); + int32_sort_2power(x + 16, 16, 0); + + x0 = int32x8_load(&x[0]); + x1 = int32x8_load(&x[8]); + x2 = int32x8_load(&x[16]); + x3 = int32x8_load(&x[24]); + + if (flagdown) { + mask = _mm256_set1_epi32(-1); + x0 ^= mask; + x1 ^= mask; + x2 ^= mask; + x3 ^= mask; + } + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + + merge16_finish(x, x0, x1, flagdown); + merge16_finish(x + 16, x2, x3, flagdown); + return; + } + + p = n >> 3; + for (i = 0; i < p; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x2 = int32x8_load(&x[i + 2 * p]); + int32x8 x4 = int32x8_load(&x[i + 4 * p]); + int32x8 x6 = int32x8_load(&x[i + 6 * p]); + + /* odd-even stage instead of bitonic stage */ + + int32x8_MINMAX(x4, x0); + int32x8_MINMAX(x6, x2); + int32x8_MINMAX(x2, x0); + int32x8_MINMAX(x6, x4); + int32x8_MINMAX(x2, x4); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + 2 * p], x2); + int32x8_store(&x[i + 4 * p], x4); + int32x8_store(&x[i + 6 * p], x6); + + int32x8 x1 = int32x8_load(&x[i + p]); + int32x8 x3 = int32x8_load(&x[i + 3 * p]); + int32x8 x5 = int32x8_load(&x[i + 5 * p]); + int32x8 x7 = int32x8_load(&x[i + 7 * p]); + + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x5, x3); + + int32x8_store(&x[i + p], x1); + int32x8_store(&x[i + 3 * p], x3); + int32x8_store(&x[i + 5 * p], x5); + int32x8_store(&x[i + 7 * p], x7); + } + + if (n >= 128) { + int flip, flipflip; + + mask = _mm256_set1_epi32(-1); + + for (j = 0; j < n; j += 32) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 16]); + x0 ^= mask; + x1 ^= mask; + int32x8_store(&x[j], x0); + int32x8_store(&x[j + 16], x1); + } + + p = 8; + for (;;) { /* for p in [8, 16, ..., n/16] */ + q = p >> 1; + while (q >= 128) { + int32_threestages(x, n, q >> 2); + q >>= 3; + } + if (q == 64) { + int32_twostages_32(x, n); + q = 16; + } + if (q == 32) { + q = 8; + for (k = 0; k < n; k += 8 * q) { + for (i = k; i < k + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + } + q = 4; + } + if (q == 16) { + q = 8; + for (k = 0; k < n; k += 4 * q) { + for (i = k; i < k + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + 
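+                    /* Two merge stages fused into one pass: the four
+                       vectors loaded here are compared at distance 2*q and
+                       then at distance q before being stored back, halving
+                       the number of sweeps over the array. */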
int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + } + } + q = 4; + } + if (q == 8) { + for (k = 0; k < n; k += q + q) { + int32x8 x0 = int32x8_load(&x[k]); + int32x8 x1 = int32x8_load(&x[k + q]); + + int32x8_MINMAX(x0, x1); + + int32x8_store(&x[k], x0); + int32x8_store(&x[k + q], x1); + } + } + + q = n >> 3; + flip = 0; + if (p << 1 == q) { + flip = 1; + } + flipflip = 1 - flip; + for (j = 0; j < q; j += p + p) { + for (k = j; k < j + p + p; k += p) { + for (i = k; i < k + p; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + + if (flip) { + x0 ^= mask; + x1 ^= mask; + x2 ^= mask; + x3 ^= mask; + x4 ^= mask; + x5 ^= mask; + x6 ^= mask; + x7 ^= mask; + } + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + flip ^= 1; + } + flip ^= flipflip; + } + + if (p << 4 == n) { + break; + } + p <<= 1; + } + } + + for (p = 4; p >= 1; p >>= 1) { + int32 *z = x; + int32 *target = x + n; + if (p == 4) { + mask = _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1); + while (z != target) { + int32x8 x0 = int32x8_load(&z[0]); + int32x8 x1 = int32x8_load(&z[8]); + x0 ^= mask; + x1 ^= mask; + int32x8_store(&z[0], x0); + int32x8_store(&z[8], x1); + z += 16; + } + } else if (p == 2) { + mask = _mm256_set_epi32(0, 0, -1, -1, -1, -1, 0, 0); + while (z != target) { + int32x8 x0 = int32x8_load(&z[0]); + int32x8 x1 = int32x8_load(&z[8]); + x0 ^= mask; + x1 ^= mask; + int32x8 b0 = _mm256_permute2x128_si256(x0, x1, 0x20); + int32x8 b1 = _mm256_permute2x128_si256(x0, x1, 0x31); + int32x8_MINMAX(b0, b1); + int32x8 c0 = _mm256_permute2x128_si256(b0, b1, 0x20); + int32x8 c1 = _mm256_permute2x128_si256(b0, b1, 0x31); + int32x8_store(&z[0], c0); + int32x8_store(&z[8], c1); + z += 16; + } + } else { /* p == 1 */ + mask = _mm256_set_epi32(0, -1, -1, 0, 0, -1, -1, 0); + while (z != target) { + int32x8 x0 = int32x8_load(&z[0]); + int32x8 x1 = int32x8_load(&z[8]); + x0 ^= mask; + x1 ^= mask; + int32x8 b0 = _mm256_permute2x128_si256(x0, x1, 0x20); /* A0123B0123 */ + int32x8 b1 = _mm256_permute2x128_si256(x0, x1, 0x31); /* A4567B4567 */ + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); /* A0145B0145 */ + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); /* A2367B2367 */ + int32x8_MINMAX(c0, c1); + int32x8 d0 = _mm256_unpacklo_epi64(c0, c1); /* A0123B0123 */ + int32x8 d1 = _mm256_unpackhi_epi64(c0, c1); /* A4567B4567 */ + int32x8_MINMAX(d0, d1); + int32x8 e0 = _mm256_permute2x128_si256(d0, d1, 0x20); + 
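+            /* Each pass of the enclosing p = 4, 2, 1 loop complements a
+               fixed lane pattern and, for p == 2 and p == 1, shuffles the
+               two registers so that network partners share a lane before
+               the vector compare/exchanges, then shuffles them back
+               (p == 4 only applies the complement).  Keeping runs that must
+               end up descending in complemented form is what lets a single
+               min/max primitive serve both directions. */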
int32x8 e1 = _mm256_permute2x128_si256(d0, d1, 0x31); + int32x8_store(&z[0], e0); + int32x8_store(&z[8], e1); + z += 16; + } + } + + q = n >> 4; + while (q >= 128 || q == 32) { + int32_threestages(x, n, q >> 2); + q >>= 3; + } + while (q >= 16) { + q >>= 1; + for (j = 0; j < n; j += 4 * q) { + for (k = j; k < j + q; k += 8) { + int32x8 x0 = int32x8_load(&x[k]); + int32x8 x1 = int32x8_load(&x[k + q]); + int32x8 x2 = int32x8_load(&x[k + 2 * q]); + int32x8 x3 = int32x8_load(&x[k + 3 * q]); + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + + int32x8_store(&x[k], x0); + int32x8_store(&x[k + q], x1); + int32x8_store(&x[k + 2 * q], x2); + int32x8_store(&x[k + 3 * q], x3); + } + } + q >>= 1; + } + if (q == 8) { + for (j = 0; j < n; j += 2 * q) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + q]); + + int32x8_MINMAX(x0, x1); + + int32x8_store(&x[j], x0); + int32x8_store(&x[j + q], x1); + } + } + + q = n >> 3; + for (k = 0; k < q; k += 8) { + int32x8 x0 = int32x8_load(&x[k]); + int32x8 x1 = int32x8_load(&x[k + q]); + int32x8 x2 = int32x8_load(&x[k + 2 * q]); + int32x8 x3 = int32x8_load(&x[k + 3 * q]); + int32x8 x4 = int32x8_load(&x[k + 4 * q]); + int32x8 x5 = int32x8_load(&x[k + 5 * q]); + int32x8 x6 = int32x8_load(&x[k + 6 * q]); + int32x8 x7 = int32x8_load(&x[k + 7 * q]); + + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + + int32x8_store(&x[k], x0); + int32x8_store(&x[k + q], x1); + int32x8_store(&x[k + 2 * q], x2); + int32x8_store(&x[k + 3 * q], x3); + int32x8_store(&x[k + 4 * q], x4); + int32x8_store(&x[k + 5 * q], x5); + int32x8_store(&x[k + 6 * q], x6); + int32x8_store(&x[k + 7 * q], x7); + } + } + + /* everything is still masked with _mm256_set_epi32(0,-1,0,-1,0,-1,0,-1); */ + mask = _mm256_set1_epi32(-1); + + for (i = 0; i < n; i += 64) { + int32x8 a0 = int32x8_load(&x[i]); + int32x8 a1 = int32x8_load(&x[i + 8]); + int32x8 a2 = int32x8_load(&x[i + 16]); + int32x8 a3 = int32x8_load(&x[i + 24]); + int32x8 a4 = int32x8_load(&x[i + 32]); + int32x8 a5 = int32x8_load(&x[i + 40]); + int32x8 a6 = int32x8_load(&x[i + 48]); + int32x8 a7 = int32x8_load(&x[i + 56]); + + int32x8 b0 = _mm256_unpacklo_epi32(a0, a1); /* AB0AB1AB4AB5 */ + int32x8 b1 = _mm256_unpackhi_epi32(a0, a1); /* AB2AB3AB6AB7 */ + int32x8 b2 = _mm256_unpacklo_epi32(a2, a3); /* CD0CD1CD4CD5 */ + int32x8 b3 = _mm256_unpackhi_epi32(a2, a3); /* CD2CD3CD6CD7 */ + int32x8 b4 = _mm256_unpacklo_epi32(a4, a5); /* EF0EF1EF4EF5 */ + int32x8 b5 = _mm256_unpackhi_epi32(a4, a5); /* EF2EF3EF6EF7 */ + int32x8 b6 = _mm256_unpacklo_epi32(a6, a7); /* GH0GH1GH4GH5 */ + int32x8 b7 = _mm256_unpackhi_epi32(a6, a7); /* GH2GH3GH6GH7 */ + + int32x8 c0 = _mm256_unpacklo_epi64(b0, b2); /* ABCD0ABCD4 */ + int32x8 c1 = _mm256_unpacklo_epi64(b1, b3); /* ABCD2ABCD6 */ + int32x8 c2 = _mm256_unpackhi_epi64(b0, b2); /* ABCD1ABCD5 */ + int32x8 c3 = _mm256_unpackhi_epi64(b1, b3); /* ABCD3ABCD7 */ + int32x8 c4 = _mm256_unpacklo_epi64(b4, b6); /* EFGH0EFGH4 */ + int32x8 c5 = _mm256_unpacklo_epi64(b5, b7); /* EFGH2EFGH6 */ + int32x8 c6 = _mm256_unpackhi_epi64(b4, b6); /* EFGH1EFGH5 */ + int32x8 c7 = _mm256_unpackhi_epi64(b5, b7); /* EFGH3EFGH7 */ + + if (flagdown) { + c2 ^= mask; + c3 ^= mask; + c6 ^= mask; + c7 ^= mask; + } else { + 
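+            /* Only a bitwise complement separates ascending from descending
+               output here: complementing a two's-complement int32 reverses
+               its order, so whichever rows are complemented (the choice
+               depends on flagdown) act as descending runs in the
+               compare/exchange network below, while the same min/max
+               primitive is used throughout. */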
c0 ^= mask; + c1 ^= mask; + c4 ^= mask; + c5 ^= mask; + } + + int32x8 d0 = _mm256_permute2x128_si256(c0, c4, 0x20); /* ABCDEFGH0 */ + int32x8 d1 = _mm256_permute2x128_si256(c2, c6, 0x20); /* ABCDEFGH1 */ + int32x8 d2 = _mm256_permute2x128_si256(c1, c5, 0x20); /* ABCDEFGH2 */ + int32x8 d3 = _mm256_permute2x128_si256(c3, c7, 0x20); /* ABCDEFGH5 */ + int32x8 d4 = _mm256_permute2x128_si256(c0, c4, 0x31); /* ABCDEFGH4 */ + int32x8 d5 = _mm256_permute2x128_si256(c2, c6, 0x31); /* ABCDEFGH3 */ + int32x8 d6 = _mm256_permute2x128_si256(c1, c5, 0x31); /* ABCDEFGH6 */ + int32x8 d7 = _mm256_permute2x128_si256(c3, c7, 0x31); /* ABCDEFGH7 */ + + int32x8_MINMAX(d0, d1); + int32x8_MINMAX(d2, d3); + int32x8_MINMAX(d4, d5); + int32x8_MINMAX(d6, d7); + int32x8_MINMAX(d0, d2); + int32x8_MINMAX(d1, d3); + int32x8_MINMAX(d4, d6); + int32x8_MINMAX(d5, d7); + int32x8_MINMAX(d0, d4); + int32x8_MINMAX(d1, d5); + int32x8_MINMAX(d2, d6); + int32x8_MINMAX(d3, d7); + + int32x8 e0 = _mm256_unpacklo_epi32(d0, d1); + int32x8 e1 = _mm256_unpackhi_epi32(d0, d1); + int32x8 e2 = _mm256_unpacklo_epi32(d2, d3); + int32x8 e3 = _mm256_unpackhi_epi32(d2, d3); + int32x8 e4 = _mm256_unpacklo_epi32(d4, d5); + int32x8 e5 = _mm256_unpackhi_epi32(d4, d5); + int32x8 e6 = _mm256_unpacklo_epi32(d6, d7); + int32x8 e7 = _mm256_unpackhi_epi32(d6, d7); + + int32x8 f0 = _mm256_unpacklo_epi64(e0, e2); + int32x8 f1 = _mm256_unpacklo_epi64(e1, e3); + int32x8 f2 = _mm256_unpackhi_epi64(e0, e2); + int32x8 f3 = _mm256_unpackhi_epi64(e1, e3); + int32x8 f4 = _mm256_unpacklo_epi64(e4, e6); + int32x8 f5 = _mm256_unpacklo_epi64(e5, e7); + int32x8 f6 = _mm256_unpackhi_epi64(e4, e6); + int32x8 f7 = _mm256_unpackhi_epi64(e5, e7); + + int32x8 g0 = _mm256_permute2x128_si256(f0, f4, 0x20); + int32x8 g1 = _mm256_permute2x128_si256(f2, f6, 0x20); + int32x8 g2 = _mm256_permute2x128_si256(f1, f5, 0x20); + int32x8 g3 = _mm256_permute2x128_si256(f3, f7, 0x20); + int32x8 g4 = _mm256_permute2x128_si256(f0, f4, 0x31); + int32x8 g5 = _mm256_permute2x128_si256(f2, f6, 0x31); + int32x8 g6 = _mm256_permute2x128_si256(f1, f5, 0x31); + int32x8 g7 = _mm256_permute2x128_si256(f3, f7, 0x31); + + int32x8_store(&x[i], g0); + int32x8_store(&x[i + 8], g1); + int32x8_store(&x[i + 16], g2); + int32x8_store(&x[i + 24], g3); + int32x8_store(&x[i + 32], g4); + int32x8_store(&x[i + 40], g5); + int32x8_store(&x[i + 48], g6); + int32x8_store(&x[i + 56], g7); + } + + q = n >> 4; + while (q >= 128 || q == 32) { + q >>= 2; + for (j = 0; j < n; j += 8 * q) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + } + q >>= 1; + } + while (q >= 16) { + q >>= 1; + for (j = 0; j < n; j += 4 * q) { + 
for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + } + } + q >>= 1; + } + if (q == 8) { + for (j = 0; j < n; j += q + q) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + q]); + int32x8_MINMAX(x0, x1); + int32x8_store(&x[j], x0); + int32x8_store(&x[j + q], x1); + } + } + + q = n >> 3; + for (i = 0; i < q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + + int32x8 b0 = _mm256_unpacklo_epi32(x0, x4); /* AE0AE1AE4AE5 */ + int32x8 b1 = _mm256_unpackhi_epi32(x0, x4); /* AE2AE3AE6AE7 */ + int32x8 b2 = _mm256_unpacklo_epi32(x1, x5); /* BF0BF1BF4BF5 */ + int32x8 b3 = _mm256_unpackhi_epi32(x1, x5); /* BF2BF3BF6BF7 */ + int32x8 b4 = _mm256_unpacklo_epi32(x2, x6); /* CG0CG1CG4CG5 */ + int32x8 b5 = _mm256_unpackhi_epi32(x2, x6); /* CG2CG3CG6CG7 */ + int32x8 b6 = _mm256_unpacklo_epi32(x3, x7); /* DH0DH1DH4DH5 */ + int32x8 b7 = _mm256_unpackhi_epi32(x3, x7); /* DH2DH3DH6DH7 */ + + int32x8 c0 = _mm256_unpacklo_epi64(b0, b4); /* AECG0AECG4 */ + int32x8 c1 = _mm256_unpacklo_epi64(b1, b5); /* AECG2AECG6 */ + int32x8 c2 = _mm256_unpackhi_epi64(b0, b4); /* AECG1AECG5 */ + int32x8 c3 = _mm256_unpackhi_epi64(b1, b5); /* AECG3AECG7 */ + int32x8 c4 = _mm256_unpacklo_epi64(b2, b6); /* BFDH0BFDH4 */ + int32x8 c5 = _mm256_unpacklo_epi64(b3, b7); /* BFDH2BFDH6 */ + int32x8 c6 = _mm256_unpackhi_epi64(b2, b6); /* BFDH1BFDH5 */ + int32x8 c7 = _mm256_unpackhi_epi64(b3, b7); /* BFDH3BFDH7 */ + + int32x8 d0 = _mm256_permute2x128_si256(c0, c4, 0x20); /* AECGBFDH0 */ + int32x8 d1 = _mm256_permute2x128_si256(c1, c5, 0x20); /* AECGBFDH2 */ + int32x8 d2 = _mm256_permute2x128_si256(c2, c6, 0x20); /* AECGBFDH1 */ + int32x8 d3 = _mm256_permute2x128_si256(c3, c7, 0x20); /* AECGBFDH3 */ + int32x8 d4 = _mm256_permute2x128_si256(c0, c4, 0x31); /* AECGBFDH4 */ + int32x8 d5 = _mm256_permute2x128_si256(c1, c5, 0x31); /* AECGBFDH6 */ + int32x8 d6 = _mm256_permute2x128_si256(c2, c6, 0x31); /* AECGBFDH5 */ + int32x8 d7 = _mm256_permute2x128_si256(c3, c7, 0x31); /* AECGBFDH7 */ + + if (flagdown) { + d0 ^= mask; + d1 ^= mask; + d2 ^= mask; + d3 ^= mask; + d4 ^= mask; + d5 ^= mask; + d6 ^= mask; + d7 ^= mask; + } + + int32x8_store(&x[i], d0); + int32x8_store(&x[i + q], d4); + int32x8_store(&x[i + 2 * q], d1); + int32x8_store(&x[i + 3 * q], d5); + int32x8_store(&x[i + 4 * q], d2); + int32x8_store(&x[i + 5 * q], d6); + int32x8_store(&x[i + 6 * q], d3); + int32x8_store(&x[i + 7 * q], d7); + } +} + +void PQCLEAN_MCELIECE460896_AVX_int32_sort(int32 *x, size_t n) { + size_t q, i, j; + + if (n <= 8) { + if (n == 8) { + int32_MINMAX(&x[0], &x[1]); 
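+            /* For n <= 8 this block and the if blocks below it cascade
+               (there is no else): one pass of adjacent compare/exchanges
+               runs for every length from n down to 2, i.e. the fallback is
+               equivalent to the bubble-sort formulation below (m and k are
+               illustrative loop variables):
+
+                   for (m = n; m >= 2; m--)
+                       for (k = 0; k + 1 < m; k++)
+                           int32_MINMAX(&x[k], &x[k + 1]);
+            */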
+ int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + int32_MINMAX(&x[4], &x[5]); + int32_MINMAX(&x[5], &x[6]); + int32_MINMAX(&x[6], &x[7]); + } + if (n >= 7) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + int32_MINMAX(&x[4], &x[5]); + int32_MINMAX(&x[5], &x[6]); + } + if (n >= 6) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + int32_MINMAX(&x[4], &x[5]); + } + if (n >= 5) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + } + if (n >= 4) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + } + if (n >= 3) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + } + if (n >= 2) { + int32_MINMAX(&x[0], &x[1]); + } + return; + } + + if (!(n & (n - 1))) { + int32_sort_2power(x, n, 0); + return; + } + + q = 8; + while (q < n - q) { + q += q; + } + /* n > q >= 8 */ + + if (q <= 128) { /* n <= 256 */ + int32x8 y[32]; + for (i = q >> 3; i < q >> 2; ++i) { + y[i] = _mm256_set1_epi32(0x7fffffff); + } + for (i = 0; i < n; ++i) { + ((int32 *)y)[i] = x[i]; + } + int32_sort_2power((int32 *) y, 2 * q, 0); + for (i = 0; i < n; ++i) { + x[i] = ((int32 *) y)[i]; + } + return; + } + + int32_sort_2power(x, q, 1); + PQCLEAN_MCELIECE460896_AVX_int32_sort(x + q, n - q); + + while (q >= 64) { + q >>= 2; + j = int32_threestages(x, n, q); + minmax_vector(x + j, x + j + 4 * q, n - 4 * q - j); + if (j + 4 * q <= n) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + } + j += 4 * q; + } + minmax_vector(x + j, x + j + 2 * q, n - 2 * q - j); + if (j + 2 * q <= n) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8_MINMAX(x0, x1); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + } + j += 2 * q; + } + minmax_vector(x + j, x + j + q, n - q - j); + q >>= 1; + } + if (q == 32) { + j = 0; + for (; j + 64 <= n; j += 64) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 8]); + int32x8 x2 = int32x8_load(&x[j + 16]); + int32x8 x3 = int32x8_load(&x[j + 24]); + int32x8 x4 = int32x8_load(&x[j + 32]); + int32x8 x5 = int32x8_load(&x[j + 40]); + int32x8 x6 = int32x8_load(&x[j + 48]); + int32x8 x7 = int32x8_load(&x[j + 56]); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8 a0 = _mm256_permute2x128_si256(x0, x1, 0x20); + int32x8 a1 = _mm256_permute2x128_si256(x0, x1, 0x31); + int32x8 a2 = _mm256_permute2x128_si256(x2, x3, 0x20); + int32x8 a3 = _mm256_permute2x128_si256(x2, x3, 0x31); + int32x8 a4 = _mm256_permute2x128_si256(x4, x5, 0x20); + int32x8 a5 = _mm256_permute2x128_si256(x4, x5, 0x31); + int32x8 a6 = _mm256_permute2x128_si256(x6, x7, 0x20); + int32x8 a7 = 
_mm256_permute2x128_si256(x6, x7, 0x31); + int32x8_MINMAX(a0, a1); + int32x8_MINMAX(a2, a3); + int32x8_MINMAX(a4, a5); + int32x8_MINMAX(a6, a7); + int32x8 b0 = _mm256_permute2x128_si256(a0, a1, 0x20); + int32x8 b1 = _mm256_permute2x128_si256(a0, a1, 0x31); + int32x8 b2 = _mm256_permute2x128_si256(a2, a3, 0x20); + int32x8 b3 = _mm256_permute2x128_si256(a2, a3, 0x31); + int32x8 b4 = _mm256_permute2x128_si256(a4, a5, 0x20); + int32x8 b5 = _mm256_permute2x128_si256(a4, a5, 0x31); + int32x8 b6 = _mm256_permute2x128_si256(a6, a7, 0x20); + int32x8 b7 = _mm256_permute2x128_si256(a6, a7, 0x31); + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); + int32x8 c2 = _mm256_unpacklo_epi64(b2, b3); + int32x8 c3 = _mm256_unpackhi_epi64(b2, b3); + int32x8 c4 = _mm256_unpacklo_epi64(b4, b5); + int32x8 c5 = _mm256_unpackhi_epi64(b4, b5); + int32x8 c6 = _mm256_unpacklo_epi64(b6, b7); + int32x8 c7 = _mm256_unpackhi_epi64(b6, b7); + int32x8_MINMAX(c0, c1); + int32x8_MINMAX(c2, c3); + int32x8_MINMAX(c4, c5); + int32x8_MINMAX(c6, c7); + int32x8 d0 = _mm256_unpacklo_epi32(c0, c1); + int32x8 d1 = _mm256_unpackhi_epi32(c0, c1); + int32x8 d2 = _mm256_unpacklo_epi32(c2, c3); + int32x8 d3 = _mm256_unpackhi_epi32(c2, c3); + int32x8 d4 = _mm256_unpacklo_epi32(c4, c5); + int32x8 d5 = _mm256_unpackhi_epi32(c4, c5); + int32x8 d6 = _mm256_unpacklo_epi32(c6, c7); + int32x8 d7 = _mm256_unpackhi_epi32(c6, c7); + int32x8 e0 = _mm256_unpacklo_epi64(d0, d1); + int32x8 e1 = _mm256_unpackhi_epi64(d0, d1); + int32x8 e2 = _mm256_unpacklo_epi64(d2, d3); + int32x8 e3 = _mm256_unpackhi_epi64(d2, d3); + int32x8 e4 = _mm256_unpacklo_epi64(d4, d5); + int32x8 e5 = _mm256_unpackhi_epi64(d4, d5); + int32x8 e6 = _mm256_unpacklo_epi64(d6, d7); + int32x8 e7 = _mm256_unpackhi_epi64(d6, d7); + int32x8_MINMAX(e0, e1); + int32x8_MINMAX(e2, e3); + int32x8_MINMAX(e4, e5); + int32x8_MINMAX(e6, e7); + int32x8 f0 = _mm256_unpacklo_epi32(e0, e1); + int32x8 f1 = _mm256_unpackhi_epi32(e0, e1); + int32x8 f2 = _mm256_unpacklo_epi32(e2, e3); + int32x8 f3 = _mm256_unpackhi_epi32(e2, e3); + int32x8 f4 = _mm256_unpacklo_epi32(e4, e5); + int32x8 f5 = _mm256_unpackhi_epi32(e4, e5); + int32x8 f6 = _mm256_unpacklo_epi32(e6, e7); + int32x8 f7 = _mm256_unpackhi_epi32(e6, e7); + int32x8_store(&x[j], f0); + int32x8_store(&x[j + 8], f1); + int32x8_store(&x[j + 16], f2); + int32x8_store(&x[j + 24], f3); + int32x8_store(&x[j + 32], f4); + int32x8_store(&x[j + 40], f5); + int32x8_store(&x[j + 48], f6); + int32x8_store(&x[j + 56], f7); + } + minmax_vector(x + j, x + j + 32, n - 32 - j); + goto continue16; + } + if (q == 16) { + j = 0; +continue16: + for (; j + 32 <= n; j += 32) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 8]); + int32x8 x2 = int32x8_load(&x[j + 16]); + int32x8 x3 = int32x8_load(&x[j + 24]); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8 a0 = _mm256_permute2x128_si256(x0, x1, 0x20); + int32x8 a1 = _mm256_permute2x128_si256(x0, x1, 0x31); + int32x8 a2 = _mm256_permute2x128_si256(x2, x3, 0x20); + int32x8 a3 = _mm256_permute2x128_si256(x2, x3, 0x31); + int32x8_MINMAX(a0, a1); + int32x8_MINMAX(a2, a3); + int32x8 b0 = _mm256_permute2x128_si256(a0, a1, 0x20); + int32x8 b1 = _mm256_permute2x128_si256(a0, a1, 0x31); + int32x8 b2 = _mm256_permute2x128_si256(a2, a3, 0x20); + int32x8 b3 = _mm256_permute2x128_si256(a2, a3, 0x31); + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); + int32x8 c2 = 
_mm256_unpacklo_epi64(b2, b3); + int32x8 c3 = _mm256_unpackhi_epi64(b2, b3); + int32x8_MINMAX(c0, c1); + int32x8_MINMAX(c2, c3); + int32x8 d0 = _mm256_unpacklo_epi32(c0, c1); + int32x8 d1 = _mm256_unpackhi_epi32(c0, c1); + int32x8 d2 = _mm256_unpacklo_epi32(c2, c3); + int32x8 d3 = _mm256_unpackhi_epi32(c2, c3); + int32x8 e0 = _mm256_unpacklo_epi64(d0, d1); + int32x8 e1 = _mm256_unpackhi_epi64(d0, d1); + int32x8 e2 = _mm256_unpacklo_epi64(d2, d3); + int32x8 e3 = _mm256_unpackhi_epi64(d2, d3); + int32x8_MINMAX(e0, e1); + int32x8_MINMAX(e2, e3); + int32x8 f0 = _mm256_unpacklo_epi32(e0, e1); + int32x8 f1 = _mm256_unpackhi_epi32(e0, e1); + int32x8 f2 = _mm256_unpacklo_epi32(e2, e3); + int32x8 f3 = _mm256_unpackhi_epi32(e2, e3); + int32x8_store(&x[j], f0); + int32x8_store(&x[j + 8], f1); + int32x8_store(&x[j + 16], f2); + int32x8_store(&x[j + 24], f3); + } + minmax_vector(x + j, x + j + 16, n - 16 - j); + goto continue8; + } + /* q == 8 */ + j = 0; +continue8: + for (; j + 16 <= n; j += 16) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 8]); + int32x8_MINMAX(x0, x1); + int32x8_store(&x[j], x0); + int32x8_store(&x[j + 8], x1); + int32x8 a0 = _mm256_permute2x128_si256(x0, x1, 0x20); /* x0123y0123 */ + int32x8 a1 = _mm256_permute2x128_si256(x0, x1, 0x31); /* x4567y4567 */ + int32x8_MINMAX(a0, a1); + int32x8 b0 = _mm256_permute2x128_si256(a0, a1, 0x20); /* x01234567 */ + int32x8 b1 = _mm256_permute2x128_si256(a0, a1, 0x31); /* y01234567 */ + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); /* x01y01x45y45 */ + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); /* x23y23x67y67 */ + int32x8_MINMAX(c0, c1); + int32x8 d0 = _mm256_unpacklo_epi32(c0, c1); /* x02x13x46x57 */ + int32x8 d1 = _mm256_unpackhi_epi32(c0, c1); /* y02y13y46y57 */ + int32x8 e0 = _mm256_unpacklo_epi64(d0, d1); /* x02y02x46y46 */ + int32x8 e1 = _mm256_unpackhi_epi64(d0, d1); /* x13y13x57y57 */ + int32x8_MINMAX(e0, e1); + int32x8 f0 = _mm256_unpacklo_epi32(e0, e1); /* x01234567 */ + int32x8 f1 = _mm256_unpackhi_epi32(e0, e1); /* y01234567 */ + int32x8_store(&x[j], f0); + int32x8_store(&x[j + 8], f1); + } + minmax_vector(x + j, x + j + 8, n - 8 - j); + if (j + 8 <= n) { + int32_MINMAX(&x[j], &x[j + 4]); + int32_MINMAX(&x[j + 1], &x[j + 5]); + int32_MINMAX(&x[j + 2], &x[j + 6]); + int32_MINMAX(&x[j + 3], &x[j + 7]); + int32_MINMAX(&x[j], &x[j + 2]); + int32_MINMAX(&x[j + 1], &x[j + 3]); + int32_MINMAX(&x[j], &x[j + 1]); + int32_MINMAX(&x[j + 2], &x[j + 3]); + int32_MINMAX(&x[j + 4], &x[j + 6]); + int32_MINMAX(&x[j + 5], &x[j + 7]); + int32_MINMAX(&x[j + 4], &x[j + 5]); + int32_MINMAX(&x[j + 6], &x[j + 7]); + j += 8; + } + minmax_vector(x + j, x + j + 4, n - 4 - j); + if (j + 4 <= n) { + int32_MINMAX(&x[j], &x[j + 2]); + int32_MINMAX(&x[j + 1], &x[j + 3]); + int32_MINMAX(&x[j], &x[j + 1]); + int32_MINMAX(&x[j + 2], &x[j + 3]); + j += 4; + } + if (j + 3 <= n) { + int32_MINMAX(&x[j], &x[j + 2]); + } + if (j + 2 <= n) { + int32_MINMAX(&x[j], &x[j + 1]); + } +} diff --git a/src/kem/classic_mceliece/pqclean_mceliece460896_avx/int32_sort.h b/src/kem/classic_mceliece/pqclean_mceliece460896_avx/int32_sort.h new file mode 100644 index 0000000000..91832cd651 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece460896_avx/int32_sort.h @@ -0,0 +1,9 @@ +#ifndef PQCLEAN_MCELIECE460896_AVX_INT32_SORT_H +#define PQCLEAN_MCELIECE460896_AVX_INT32_SORT_H + +#include +#include + +void PQCLEAN_MCELIECE460896_AVX_int32_sort(int32_t *x, size_t n); + +#endif diff --git a/src/kem/classic_mceliece/pqclean_mceliece460896_avx/operations.c 
b/src/kem/classic_mceliece/pqclean_mceliece460896_avx/operations.c new file mode 100644 index 0000000000..bb6da6d63d --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece460896_avx/operations.c @@ -0,0 +1,136 @@ +#include "api.h" + +#include "aes256ctr.h" +#include "controlbits.h" +#include "crypto_hash.h" +#include "decrypt.h" +#include "encrypt.h" +#include "params.h" +#include "pk_gen.h" +#include "randombytes.h" +#include "sk_gen.h" +#include "util.h" + +#include +#include + +int PQCLEAN_MCELIECE460896_AVX_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const uint8_t *pk +) { + uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; + uint8_t *e = two_e + 1; + uint8_t one_ec[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ] = {1}; + + PQCLEAN_MCELIECE460896_AVX_encrypt(c, e, pk); + + crypto_hash_32b(c + SYND_BYTES, two_e, sizeof(two_e)); + + memcpy(one_ec + 1, e, SYS_N / 8); + memcpy(one_ec + 1 + SYS_N / 8, c, SYND_BYTES + 32); + + crypto_hash_32b(key, one_ec, sizeof(one_ec)); + + return 0; +} + +int PQCLEAN_MCELIECE460896_AVX_crypto_kem_dec( + uint8_t *key, + const uint8_t *c, + const uint8_t *sk +) { + int i; + + uint8_t ret_confirm = 0; + uint8_t ret_decrypt = 0; + + uint16_t m; + + uint8_t conf[32]; + uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; + uint8_t *e = two_e + 1; + uint8_t preimage[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ]; + uint8_t *x = preimage; + + // + + ret_decrypt = (uint8_t)PQCLEAN_MCELIECE460896_AVX_decrypt(e, sk + SYS_N / 8, c); + + crypto_hash_32b(conf, two_e, sizeof(two_e)); + + for (i = 0; i < 32; i++) { + ret_confirm |= conf[i] ^ c[SYND_BYTES + i]; + } + + m = ret_decrypt | ret_confirm; + m -= 1; + m >>= 8; + + *x++ = (~m & 0) | (m & 1); + for (i = 0; i < SYS_N / 8; i++) { + *x++ = (~m & sk[i]) | (m & e[i]); + } + for (i = 0; i < SYND_BYTES + 32; i++) { + *x++ = c[i]; + } + + crypto_hash_32b(key, preimage, sizeof(preimage)); + + return 0; +} + +int PQCLEAN_MCELIECE460896_AVX_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +) { + int i; + uint8_t seed[ 32 ]; + uint8_t r[ SYS_T * 2 + (1 << GFBITS)*sizeof(uint32_t) + SYS_N / 8 + 32 ]; + uint8_t nonce[ 16 ] = {0}; + uint8_t *rp; + + gf f[ SYS_T ]; // element in GF(2^mt) + gf irr[ SYS_T ]; // Goppa polynomial + uint32_t perm[ 1 << GFBITS ]; // random permutation + + randombytes(seed, sizeof(seed)); + + while (1) { + rp = r; + PQCLEAN_MCELIECE460896_AVX_aes256ctr(r, sizeof(r), nonce, seed); + memcpy(seed, &r[ sizeof(r) - 32 ], 32); + + for (i = 0; i < SYS_T; i++) { + f[i] = PQCLEAN_MCELIECE460896_AVX_load2(rp + i * 2); + } + rp += sizeof(f); + if (PQCLEAN_MCELIECE460896_AVX_genpoly_gen(irr, f)) { + continue; + } + + for (i = 0; i < (1 << GFBITS); i++) { + perm[i] = PQCLEAN_MCELIECE460896_AVX_load4(rp + i * 4); + } + rp += sizeof(perm); + if (PQCLEAN_MCELIECE460896_AVX_perm_check(perm)) { + continue; + } + + for (i = 0; i < SYS_T; i++) { + PQCLEAN_MCELIECE460896_AVX_store2(sk + SYS_N / 8 + i * 2, irr[i]); + } + if (PQCLEAN_MCELIECE460896_AVX_pk_gen(pk, perm, sk + SYS_N / 8)) { + continue; + } + + memcpy(sk, rp, SYS_N / 8); + PQCLEAN_MCELIECE460896_AVX_controlbits(sk + SYS_N / 8 + IRR_BYTES, perm); + + break; + } + + return 0; +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece460896_avx/params.h b/src/kem/classic_mceliece/pqclean_mceliece460896_avx/params.h new file mode 100644 index 0000000000..33cb311b04 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece460896_avx/params.h @@ -0,0 +1,21 @@ +#ifndef PQCLEAN_MCELIECE460896_AVX_PARAMS_H +#define PQCLEAN_MCELIECE460896_AVX_PARAMS_H + +#define GFBITS 13 +#define SYS_N 4608 +#define 
SYS_T 96 + +#define COND_BYTES ((1 << (GFBITS-4))*(2*GFBITS - 1)) +#define IRR_BYTES (SYS_T * 2) + +#define PK_NROWS (SYS_T*GFBITS) +#define PK_NCOLS (SYS_N - PK_NROWS) +#define PK_ROW_BYTES ((PK_NCOLS + 7)/8) + +#define SK_BYTES (SYS_N/8 + IRR_BYTES + COND_BYTES) +#define SYND_BYTES ((PK_NROWS + 7)/8) + +#define GFMASK ((1 << GFBITS) - 1) + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece460896_avx/pk_gen.c b/src/kem/classic_mceliece/pqclean_mceliece460896_avx/pk_gen.c new file mode 100644 index 0000000000..80bb5df025 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece460896_avx/pk_gen.c @@ -0,0 +1,290 @@ +/* + This file is for public-key generation +*/ + +#include "pk_gen.h" + +#include "benes.h" +#include "controlbits.h" +#include "fft.h" +#include "params.h" +#include "util.h" + +#include + +static void de_bitslicing(uint64_t *out, vec256 in[][GFBITS]) { + int i, j, r; + uint64_t u = 0; + + for (i = 0; i < (1 << GFBITS); i++) { + out[i] = 0 ; + } + + for (i = 0; i < 32; i++) { + for (j = GFBITS - 1; j >= 0; j--) { + u = PQCLEAN_MCELIECE460896_AVX_vec256_extract(in[i][j], 0); + for (r = 0; r < 64; r++) { + out[i * 256 + 0 * 64 + r] <<= 1; + out[i * 256 + 0 * 64 + r] |= (u >> r) & 1; + } + u = PQCLEAN_MCELIECE460896_AVX_vec256_extract(in[i][j], 1); + for (r = 0; r < 64; r++) { + out[i * 256 + 1 * 64 + r] <<= 1; + out[i * 256 + 1 * 64 + r] |= (u >> r) & 1; + } + u = PQCLEAN_MCELIECE460896_AVX_vec256_extract(in[i][j], 2); + for (r = 0; r < 64; r++) { + out[i * 256 + 2 * 64 + r] <<= 1; + out[i * 256 + 2 * 64 + r] |= (u >> r) & 1; + } + u = PQCLEAN_MCELIECE460896_AVX_vec256_extract(in[i][j], 3); + for (r = 0; r < 64; r++) { + out[i * 256 + 3 * 64 + r] <<= 1; + out[i * 256 + 3 * 64 + r] |= (u >> r) & 1; + } + } + } +} + +static void to_bitslicing_2x(vec256 out0[][GFBITS], vec256 out1[][GFBITS], const uint64_t *in) { + int i, j, k, r; + uint64_t u[4] = {0}; + + for (i = 0; i < 32; i++) { + for (j = GFBITS - 1; j >= 0; j--) { + for (k = 0; k < 4; k++) { + for (r = 63; r >= 0; r--) { + u[k] <<= 1; + u[k] |= (in[i * 256 + k * 64 + r] >> (j + GFBITS)) & 1; + } + } + + out1[i][j] = PQCLEAN_MCELIECE460896_AVX_vec256_set4x(u[0], u[1], u[2], u[3]); + } + + for (j = GFBITS - 1; j >= 0; j--) { + for (k = 0; k < 4; k++) { + for (r = 63; r >= 0; r--) { + u[k] <<= 1; + u[k] |= (in[i * 256 + k * 64 + r] >> j) & 1; + } + } + + out0[i][GFBITS - 1 - j] = PQCLEAN_MCELIECE460896_AVX_vec256_set4x(u[0], u[1], u[2], u[3]); + } + } +} + +#define NBLOCKS1_H ((SYS_N + 63) / 64) +#define NBLOCKS2_H ((SYS_N + 255) / 256) +#define NBLOCKS1_I ((GFBITS * SYS_T + 63) / 64) +#define NBLOCKS2_I ((GFBITS * SYS_T + 255) / 256) +int PQCLEAN_MCELIECE460896_AVX_pk_gen(unsigned char *pk, uint32_t *perm, const unsigned char *sk) { + const int block_idx = NBLOCKS1_I - 1; + int tail = (GFBITS * SYS_T) % 64; + + int i, j, k; + int row, c; + + uint64_t mat[ GFBITS * SYS_T ][ NBLOCKS2_H * 4 ]; + uint64_t ops[ GFBITS * SYS_T ][ NBLOCKS1_I ]; + + uint64_t mask; + + vec128 sk_int[ GFBITS ]; + + vec256 consts[ 32 ][ GFBITS ]; + vec256 eval[ 32 ][ GFBITS ]; + vec256 prod[ 32 ][ GFBITS ]; + vec256 tmp[ GFBITS ]; + + uint64_t list[1 << GFBITS]; + uint64_t one_row[ 128 ]; + + // compute the inverses + + PQCLEAN_MCELIECE460896_AVX_irr_load(sk_int, sk); + + PQCLEAN_MCELIECE460896_AVX_fft(eval, sk_int); + + PQCLEAN_MCELIECE460896_AVX_vec256_copy(prod[0], eval[0]); + + for (i = 1; i < 32; i++) { + PQCLEAN_MCELIECE460896_AVX_vec256_mul(prod[i], prod[i - 1], eval[i]); + } + + PQCLEAN_MCELIECE460896_AVX_vec256_inv(tmp, 
prod[31]); + + for (i = 30; i >= 0; i--) { + PQCLEAN_MCELIECE460896_AVX_vec256_mul(prod[i + 1], prod[i], tmp); + PQCLEAN_MCELIECE460896_AVX_vec256_mul(tmp, tmp, eval[i + 1]); + } + + PQCLEAN_MCELIECE460896_AVX_vec256_copy(prod[0], tmp); + + // fill matrix + + de_bitslicing(list, prod); + + for (i = 0; i < (1 << GFBITS); i++) { + list[i] <<= GFBITS; + list[i] |= i; + list[i] |= ((uint64_t) perm[i]) << 31; + } + + PQCLEAN_MCELIECE460896_AVX_sort_63b(1 << GFBITS, list); + + to_bitslicing_2x(consts, prod, list); + + for (i = 0; i < (1 << GFBITS); i++) { + perm[i] = list[i] & GFMASK; + } + + for (j = 0; j < NBLOCKS2_I; j++) { + for (k = 0; k < GFBITS; k++) { + mat[ k ][ 4 * j + 0 ] = PQCLEAN_MCELIECE460896_AVX_vec256_extract(prod[ j ][ k ], 0); + mat[ k ][ 4 * j + 1 ] = PQCLEAN_MCELIECE460896_AVX_vec256_extract(prod[ j ][ k ], 1); + mat[ k ][ 4 * j + 2 ] = PQCLEAN_MCELIECE460896_AVX_vec256_extract(prod[ j ][ k ], 2); + mat[ k ][ 4 * j + 3 ] = PQCLEAN_MCELIECE460896_AVX_vec256_extract(prod[ j ][ k ], 3); + } + } + + for (i = 1; i < SYS_T; i++) { + for (j = 0; j < NBLOCKS2_I; j++) { + PQCLEAN_MCELIECE460896_AVX_vec256_mul(prod[j], prod[j], consts[j]); + + for (k = 0; k < GFBITS; k++) { + mat[ i * GFBITS + k ][ 4 * j + 0 ] = PQCLEAN_MCELIECE460896_AVX_vec256_extract(prod[ j ][ k ], 0); + mat[ i * GFBITS + k ][ 4 * j + 1 ] = PQCLEAN_MCELIECE460896_AVX_vec256_extract(prod[ j ][ k ], 1); + mat[ i * GFBITS + k ][ 4 * j + 2 ] = PQCLEAN_MCELIECE460896_AVX_vec256_extract(prod[ j ][ k ], 2); + mat[ i * GFBITS + k ][ 4 * j + 3 ] = PQCLEAN_MCELIECE460896_AVX_vec256_extract(prod[ j ][ k ], 3); + } + } + } + + // gaussian elimination to obtain an upper triangular matrix + // and keep track of the operations in ops + + for (i = 0; i < PK_NROWS; i++) { + for (j = 0; j < NBLOCKS1_I; j++) { + ops[ i ][ j ] = 0; + } + } + + for (i = 0; i < PK_NROWS; i++) { + ops[ i ][ i / 64 ] = 1; + ops[ i ][ i / 64 ] <<= (i % 64); + } + + uint64_t column[ PK_NROWS ]; + + for (i = 0; i < PK_NROWS; i++) { + column[i] = mat[ i ][ block_idx ]; + } + + for (row = 0; row < PK_NROWS; row++) { + i = row >> 6; + j = row & 63; + + for (k = row + 1; k < PK_NROWS; k++) { + mask = mat[ row ][ i ] >> j; + mask &= 1; + mask -= 1; + + for (c = 0; c < NBLOCKS1_I; c++) { + mat[ row ][ c ] ^= mat[ k ][ c ] & mask; + ops[ row ][ c ] ^= ops[ k ][ c ] & mask; + } + } + + if ( ((mat[ row ][ i ] >> j) & 1) == 0 ) { // return if not systematic + return -1; + } + + for (k = row + 1; k < PK_NROWS; k++) { + mask = mat[ k ][ i ] >> j; + mask &= 1; + mask = -mask; + + for (c = 0; c < NBLOCKS1_I; c++) { + mat[ k ][ c ] ^= mat[ row ][ c ] & mask; + ops[ k ][ c ] ^= ops[ row ][ c ] & mask; + } + } + } + + // computing the lineaer map required to obatin the systematic form + + for (row = PK_NROWS - 1; row >= 0; row--) { + for (k = 0; k < row; k++) { + mask = mat[ k ][ row / 64 ] >> (row & 63); + mask &= 1; + mask = -mask; + + for (c = 0; c < NBLOCKS1_I; c++) { + ops[ k ][ c ] ^= ops[ row ][ c ] & mask; + } + } + } + + // apply the linear map to the non-systematic part + + for (j = NBLOCKS2_I; j < NBLOCKS2_H; j++) { + for (k = 0; k < GFBITS; k++) { + mat[ k ][ 4 * j + 0 ] = PQCLEAN_MCELIECE460896_AVX_vec256_extract(prod[ j ][ k ], 0); + mat[ k ][ 4 * j + 1 ] = PQCLEAN_MCELIECE460896_AVX_vec256_extract(prod[ j ][ k ], 1); + mat[ k ][ 4 * j + 2 ] = PQCLEAN_MCELIECE460896_AVX_vec256_extract(prod[ j ][ k ], 2); + mat[ k ][ 4 * j + 3 ] = PQCLEAN_MCELIECE460896_AVX_vec256_extract(prod[ j ][ k ], 3); + } + } + + for (i = 1; i < SYS_T; i++) { + for (j = NBLOCKS2_I; j < 
NBLOCKS2_H; j++) { + PQCLEAN_MCELIECE460896_AVX_vec256_mul(prod[j], prod[j], consts[j]); + + for (k = 0; k < GFBITS; k++) { + mat[ i * GFBITS + k ][ 4 * j + 0 ] = PQCLEAN_MCELIECE460896_AVX_vec256_extract(prod[ j ][ k ], 0); + mat[ i * GFBITS + k ][ 4 * j + 1 ] = PQCLEAN_MCELIECE460896_AVX_vec256_extract(prod[ j ][ k ], 1); + mat[ i * GFBITS + k ][ 4 * j + 2 ] = PQCLEAN_MCELIECE460896_AVX_vec256_extract(prod[ j ][ k ], 2); + mat[ i * GFBITS + k ][ 4 * j + 3 ] = PQCLEAN_MCELIECE460896_AVX_vec256_extract(prod[ j ][ k ], 3); + } + } + } + + + for (i = 0; i < PK_NROWS; i++) { + mat[ i ][ block_idx ] = column[i]; + } + + for (row = 0; row < PK_NROWS; row++) { + for (k = 0; k < NBLOCKS1_H; k++) { + one_row[ k ] = 0; + } + + for (c = 0; c < PK_NROWS; c++) { + mask = ops[ row ][ c >> 6 ] >> (c & 63); + mask &= 1; + mask = -mask; + + for (k = block_idx; k < NBLOCKS1_H; k++) { + one_row[ k ] ^= mat[ c ][ k ] & mask; + } + } + + for (k = block_idx; k < NBLOCKS1_H - 1; k++) { + one_row[k] = (one_row[k] >> tail) | (one_row[k + 1] << (64 - tail)); + PQCLEAN_MCELIECE460896_AVX_store8(pk, one_row[k]); + pk += 8; + } + + one_row[k] >>= tail; + PQCLEAN_MCELIECE460896_AVX_store_i(pk, one_row[k], PK_ROW_BYTES % 8); + + pk += PK_ROW_BYTES % 8; + } + + // + + return 0; +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece460896_avx/pk_gen.h b/src/kem/classic_mceliece/pqclean_mceliece460896_avx/pk_gen.h new file mode 100644 index 0000000000..a43211f12d --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece460896_avx/pk_gen.h @@ -0,0 +1,12 @@ +#ifndef PQCLEAN_MCELIECE460896_AVX_PK_GEN_H +#define PQCLEAN_MCELIECE460896_AVX_PK_GEN_H +/* + This file is for public-key generation +*/ + +#include + +int PQCLEAN_MCELIECE460896_AVX_pk_gen(unsigned char * /*pk*/, uint32_t * /*perm*/, const unsigned char * /*sk*/); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece460896_avx/scalars_2x.inc b/src/kem/classic_mceliece/pqclean_mceliece460896_avx/scalars_2x.inc new file mode 100644 index 0000000000..78f194eaec --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece460896_avx/scalars_2x.inc @@ -0,0 +1,75 @@ +{ + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0X3C3CF30C0000C003, 0X0C0F0FCF0F0CF330), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0X0CCCC3F333C0000C, 0XF0000FC33C3CCF3C), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0X03C33F33FCC0C03C, 0X3C0F3F00C3C300FC), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0X0003000F3C03C0C0, 0X3C33CCC0F0F3CC30), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0XF33FF33030CF03F0, 0XC0CFFFFFCCCC30CC), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0X0CF0303300F0CCC0, 0X3FC3F3CCFFFC033F), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0XFF3F0C0CC0FF3CC0, 0XFC3030CCCCC0CFCF), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0XCF3CF0FF003FC000, 0X0FCF0C00CCF333C3), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0XC00FF3CF0303F300, 0XCFFCF33000CFF030), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0X3CCC0CC00CF0CC00, 0X00CFFCC330F30FCC), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0XF30FFC3C3FCCFC00, 0X3CCC3FCCC0F3FFF3), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0X3F0FC3F0CCF0C000, 0XF00F0C3FC003C0FF), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0X3000FF33CCF0F000, 0X330CCFCC03C0FC33) +}, +{ + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0X0F0F0FF0F000000F, 0XF0FFFFFFF0F00F00), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0X00FFFFFFFF0000F0, 0X00FFF0FFFF0000FF), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0XFFFF00FF00000F00, 0X00FF00000F0F0FFF), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0XFFF000F00F0FF000, 
0XF000F0000F00FF0F), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0XFFF0000F0FF000F0, 0XFF000000FFF00000), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0X00FF000FFF000000, 0XF0FF000FF00F0FF0), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0XFF0F0FFF0F0FF000, 0X0F0F0F00FF000F0F), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0X0FFF0000000F0000, 0X0F0F00F0F0F0F000), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0X00F000F0FFF00F00, 0X00F00F00F00F000F), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0X00F00FF00F00F000, 0X00F0F0F00000FFF0), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0XFFF000F000F00000, 0XFFFFFF0FF00F0FFF), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0X00F00F000FF00000, 0X0F0FFFF00FFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0X0000FF0F0000F000, 0XFFFF0F0FFF0FFF00) +}, +{ + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0X00FF0000000000FF, 0X00FF00FF00FF0000), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0XFFFFFFFFFF00FF00, 0XFF00FFFF000000FF), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0XFF0000FF00FF0000, 0X0000FFFF000000FF), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0XFFFF000000FF0000, 0X00FFFF00FF000000), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0XFF00000000FF0000, 0XFFFFFF0000FF00FF), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0X00FFFFFFFF000000, 0X0000FFFF00FFFF00), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0XFF0000FFFFFF0000, 0XFF00FF0000FFFF00), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0XFF00FF00FFFF0000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0X00FFFFFFFF00FF00, 0X0000FF0000000000), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0XFFFF000000000000, 0XFF00FFFF00FFFF00), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0X00FF0000FF000000, 0X00FFFF00000000FF), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0XFF00FF00FF000000, 0X0000FF00FF00FFFF), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0X00FF00FFFF000000, 0XFF0000FFFFFF0000) +}, +{ + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0X000000000000FFFF, 0X0000FFFF00000000), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0XFFFFFFFFFFFF0000, 0XFFFFFFFF0000FFFF), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0X0000000000000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0XFFFF0000FFFF0000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0XFFFFFFFFFFFF0000, 0X0000FFFF00000000), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0X0000FFFF00000000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0X0000FFFF00000000, 0XFFFFFFFF0000FFFF), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0XFFFF000000000000, 0X00000000FFFF0000), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0XFFFF000000000000, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0XFFFF000000000000, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000) +}, +{ + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0X00000000FFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000), + 
PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0X0000000000000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000) +} diff --git a/src/kem/classic_mceliece/pqclean_mceliece460896_avx/scalars_4x.inc b/src/kem/classic_mceliece/pqclean_mceliece460896_avx/scalars_4x.inc new file mode 100644 index 0000000000..47155d661f --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece460896_avx/scalars_4x.inc @@ -0,0 +1,91 @@ +{ + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X3C3CF30C0000C003, 0X0C0F0FCF0F0CF330, 0XF0F30C33CF03F03F, 0X3F30CC0C000F3FCC), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0CCCC3F333C0000C, 0XF0000FC33C3CCF3C, 0X00F30FC00C3300FF, 0XFC3CF030FC3FFF03), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X03C33F33FCC0C03C, 0X3C0F3F00C3C300FC, 0XF3CC3CF3F3FCF33F, 0X33FFFCFF0CCF3CC3), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0003000F3C03C0C0, 0X3C33CCC0F0F3CC30, 0X3C0FC0FC303C3F3C, 0X003CFF33C3CC30CF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XF33FF33030CF03F0, 0XC0CFFFFFCCCC30CC, 0XFC30CF303F3FF00F, 0XCFF3CF33C00F3003), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0CF0303300F0CCC0, 0X3FC3F3CCFFFC033F, 0X33300C0CC3300CF3, 0X00F3CC0CF3003CCF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFF3F0C0CC0FF3CC0, 0XFC3030CCCCC0CFCF, 0X3C030CF3F03FF3F3, 0X3C000CFCCC3C3333), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XCF3CF0FF003FC000, 0X0FCF0C00CCF333C3, 0X3CCC03FCCC3FFC03, 0XF3CF03C0FCF03FF0), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XC00FF3CF0303F300, 0XCFFCF33000CFF030, 0X033C3C3CF0003FC3, 0X3F3C3CF0C330330C), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X3CCC0CC00CF0CC00, 0X00CFFCC330F30FCC, 0XFFC0FF00F0FF0F03, 0X33CCFCC0FF0033F0), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XF30FFC3C3FCCFC00, 0X3CCC3FCCC0F3FFF3, 0XF3F30CF003FCC303, 0X33C300C0F0C003F3), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X3F0FC3F0CCF0C000, 0XF00F0C3FC003C0FF, 0X30CFCFC3CC0F3000, 0X003FF0003F00C00C), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X3000FF33CCF0F000, 0X330CCFCC03C0FC33, 0X0CF30CCF3FCFCC0F, 0XCFF3C3033F030FFF) +}, +{ + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0F0F0FF0F000000F, 0XF0FFFFFFF0F00F00, 0X0F0F00FF0FF0FFFF, 0XFF0F0F00F000F0FF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X00FFFFFFFF0000F0, 0X00FFF0FFFF0000FF, 0XF000F0F00F00FF0F, 0X0FFFFFFFFF00000F), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFF00FF00000F00, 0X00FF00000F0F0FFF, 0X000FFFF0FFF0FF0F, 0XF0FFFF000F00F0FF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFF000F00F0FF000, 0XF000F0000F00FF0F, 0X00F00FFF00000FF0, 0X0F0000F00FFF0FFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFF0000F0FF000F0, 0XFF000000FFF00000, 0XFFFFF0000FFFF00F, 0X0F0F0F00FF0F000F), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X00FF000FFF000000, 0XF0FF000FF00F0FF0, 0XFFF0FFF0000FFFF0, 0X000F0F0FFFF0F000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFF0F0FFF0F0FF000, 0X0F0F0F00FF000F0F, 0XF0F0F0000F0F0F00, 0XF0FFFF0F00F0FF0F), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0FFF0000000F0000, 0X0F0F00F0F0F0F000, 0X00F000F0F00FFF00, 0X0F0F000F0F00F0FF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X00F000F0FFF00F00, 0X00F00F00F00F000F, 0XF0FF0F0FFF00F0FF, 0X0000F0FF00FF0F0F), + 
PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X00F00FF00F00F000, 0X00F0F0F00000FFF0, 0XF0FF0FFFF0F0F0FF, 0X00FFFF0FF0FFF0F0), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFF000F000F00000, 0XFFFFFF0FF00F0FFF, 0X00FFFFFFFFFFFFF0, 0X0000000F00F0FFF0), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X00F00F000FF00000, 0X0F0FFFF00FFFFFFF, 0X00FFF0F0FF000F0F, 0XF0F00000FF00F0F0), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000FF0F0000F000, 0XFFFF0F0FFF0FFF00, 0X000FFFF0000FFF00, 0X0F0F0FFFFFFFFFFF) +}, +{ + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X00FF0000000000FF, 0X00FF00FF00FF0000, 0XFFFF00FF00FF00FF, 0XFF0000FFFFFF00FF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFFFF00FF00, 0XFF00FFFF000000FF, 0X00FFFF000000FF00, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFF0000FF00FF0000, 0X0000FFFF000000FF, 0XFFFF00FFFFFFFF00, 0XFFFF000000FFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFF000000FF0000, 0X00FFFF00FF000000, 0X0000FFFF00FFFFFF, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFF00000000FF0000, 0XFFFFFF0000FF00FF, 0X00FF0000FF0000FF, 0XFFFFFF00FFFFFF00), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X00FFFFFFFF000000, 0X0000FFFF00FFFF00, 0XFFFF0000FF00FFFF, 0X00FFFF00FFFF00FF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFF0000FFFFFF0000, 0XFF00FF0000FFFF00, 0XFF000000FFFFFF00, 0X0000FFFF00FF0000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFF00FF00FFFF0000, 0X00000000FFFFFFFF, 0X000000000000FFFF, 0X000000FFFF000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X00FFFFFFFF00FF00, 0X0000FF0000000000, 0XFF00FF00FFFF0000, 0XFF00FF0000FF00FF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFF000000000000, 0XFF00FFFF00FFFF00, 0XFFFF00FFFF00FFFF, 0X00FF0000000000FF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X00FF0000FF000000, 0X00FFFF00000000FF, 0XFFFFFFFFFF00FF00, 0XFF00FFFF00FF00FF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFF00FF00FF000000, 0X0000FF00FF00FFFF, 0XFFFF00FFFF0000FF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X00FF00FFFF000000, 0XFF0000FFFFFF0000, 0X0000FF00000000FF, 0X0000FF000000FFFF) +}, +{ + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X000000000000FFFF, 0X0000FFFF00000000, 0XFFFFFFFFFFFFFFFF, 0X0000FFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFFFFFF0000, 0XFFFFFFFF0000FFFF, 0XFFFFFFFF00000000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X00000000FFFFFFFF, 0XFFFF000000000000, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000000000000000, 0X0000FFFF00000000, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFFFFFF0000, 0X0000FFFF00000000, 0X00000000FFFF0000, 0X00000000FFFF0000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000FFFF00000000, 0XFFFF0000FFFF0000, 0X0000FFFFFFFFFFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000, 0X0000FFFFFFFFFFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0XFFFFFFFF00000000, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000FFFF00000000, 0XFFFFFFFF0000FFFF, 0X000000000000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFF000000000000, 0X00000000FFFF0000, 0X000000000000FFFF, 0X0000FFFF00000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFF000000000000, 0XFFFF0000FFFFFFFF, 0XFFFFFFFFFFFF0000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFF000000000000, 0XFFFF0000FFFFFFFF, 
0XFFFFFFFF0000FFFF, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFF00000000, 0X0000000000000000, 0XFFFF0000FFFFFFFF, 0X0000FFFFFFFFFFFF) +}, +{ + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X0000000000000000, 0X00000000FFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFF00000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFF00000000, 0X0000000000000000, 0XFFFFFFFF00000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFF00000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFF00000000, 0X00000000FFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFF00000000) +}, +{ + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE460896_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF) +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece460896_avx/sk_gen.c b/src/kem/classic_mceliece/pqclean_mceliece460896_avx/sk_gen.c new file mode 100644 index 
0000000000..3670a34def --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece460896_avx/sk_gen.c @@ -0,0 +1,98 @@ +/* + This file is for secret-key generation +*/ + +#include "sk_gen.h" + +#include "controlbits.h" +#include "gf.h" +#include "params.h" +#include "util.h" + +/* input: f, element in GF((2^m)^t) */ +/* output: out, minimal polynomial of f */ +/* return: 0 for success and -1 for failure */ +int PQCLEAN_MCELIECE460896_AVX_genpoly_gen(gf *out, gf *f) { + int i, j, k, c; + + gf mat[ SYS_T + 1 ][ SYS_T ]; + gf mask, inv, t; + + // fill matrix + + mat[0][0] = 1; + + for (i = 1; i < SYS_T; i++) { + mat[0][i] = 0; + } + + for (i = 0; i < SYS_T; i++) { + mat[1][i] = f[i]; + } + + for (j = 2; j <= SYS_T; j++) { + PQCLEAN_MCELIECE460896_AVX_GF_mul(mat[j], mat[j - 1], f); + } + + // gaussian + + for (j = 0; j < SYS_T; j++) { + for (k = j + 1; k < SYS_T; k++) { + mask = PQCLEAN_MCELIECE460896_AVX_gf_iszero(mat[ j ][ j ]); + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ j ] ^= mat[ c ][ k ] & mask; + } + + } + + if ( mat[ j ][ j ] == 0 ) { // return if not systematic + return -1; + } + + inv = PQCLEAN_MCELIECE460896_AVX_gf_inv(mat[j][j]); + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ j ] = PQCLEAN_MCELIECE460896_AVX_gf_mul(mat[ c ][ j ], inv) ; + } + + for (k = 0; k < SYS_T; k++) { + if (k != j) { + t = mat[ j ][ k ]; + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ k ] ^= PQCLEAN_MCELIECE460896_AVX_gf_mul(mat[ c ][ j ], t); + } + } + } + } + + for (i = 0; i < SYS_T; i++) { + out[i] = mat[ SYS_T ][ i ]; + } + + return 0; +} + +/* input: permutation p represented as a list of 32-bit integers */ +/* output: -1 if some integer repeats in p */ +/* 0 otherwise */ +int PQCLEAN_MCELIECE460896_AVX_perm_check(const uint32_t *p) { + int i; + uint64_t list[1 << GFBITS]; + + for (i = 0; i < (1 << GFBITS); i++) { + list[i] = p[i]; + } + + PQCLEAN_MCELIECE460896_AVX_sort_63b(1 << GFBITS, list); + + for (i = 1; i < (1 << GFBITS); i++) { + if (list[i - 1] == list[i]) { + return -1; + } + } + + return 0; +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece460896_avx/sk_gen.h b/src/kem/classic_mceliece/pqclean_mceliece460896_avx/sk_gen.h new file mode 100644 index 0000000000..8bed1da44a --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece460896_avx/sk_gen.h @@ -0,0 +1,16 @@ +#ifndef PQCLEAN_MCELIECE460896_AVX_SK_GEN_H +#define PQCLEAN_MCELIECE460896_AVX_SK_GEN_H +/* + This file is for secret-key generation +*/ + + +#include "gf.h" + +#include <stdint.h> + +int PQCLEAN_MCELIECE460896_AVX_genpoly_gen(gf * /*out*/, gf * /*f*/); +int PQCLEAN_MCELIECE460896_AVX_perm_check(const uint32_t * /*p*/); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece460896_avx/syndrome_asm.S b/src/kem/classic_mceliece/pqclean_mceliece460896_avx/syndrome_asm.S new file mode 100644 index 0000000000..88ceaadcb0 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece460896_avx/syndrome_asm.S @@ -0,0 +1,650 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: int64 b64 + +# qhasm: int64 synd + +# qhasm: int64 addr + +# qhasm: int64 c + +# qhasm: int64 c_all + +# qhasm: int64 row + +# qhasm: int64 p + +# qhasm: int64 e + +#
qhasm: int64 s + +# qhasm: reg256 pp + +# qhasm: reg256 ee + +# qhasm: reg256 ss + +# qhasm: int64 buf_ptr + +# qhasm: stack256 buf + +# qhasm: enter syndrome_asm +.p2align 5 +.global _PQCLEAN_MCELIECE460896_AVX_syndrome_asm +.global PQCLEAN_MCELIECE460896_AVX_syndrome_asm +_PQCLEAN_MCELIECE460896_AVX_syndrome_asm: +PQCLEAN_MCELIECE460896_AVX_syndrome_asm: +mov %rsp,%r11 +and $31,%r11 +add $32,%r11 +sub %r11,%rsp + +# qhasm: input_1 += 523740 +# asm 1: add $523740,buf_ptr=int64#4 +# asm 2: leaq buf_ptr=%rcx +leaq 0(%rsp),%rcx + +# qhasm: row = 1248 +# asm 1: mov $1248,>row=int64#5 +# asm 2: mov $1248,>row=%r8 +mov $1248,%r8 + +# qhasm: loop: +._loop: + +# qhasm: row -= 1 +# asm 1: sub $1,ss=reg256#1 +# asm 2: vmovupd 0(ss=%ymm0 +vmovupd 0(%rsi),%ymm0 + +# qhasm: ee = mem256[ input_2 + 156 ] +# asm 1: vmovupd 156(ee=reg256#2 +# asm 2: vmovupd 156(ee=%ymm1 +vmovupd 156(%rdx),%ymm1 + +# qhasm: ss &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 32(pp=%ymm1 +vmovupd 32(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 188 ] +# asm 1: vmovupd 188(ee=reg256#3 +# asm 2: vmovupd 188(ee=%ymm2 +vmovupd 188(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 64(pp=%ymm1 +vmovupd 64(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 220 ] +# asm 1: vmovupd 220(ee=reg256#3 +# asm 2: vmovupd 220(ee=%ymm2 +vmovupd 220(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 96(pp=%ymm1 +vmovupd 96(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 252 ] +# asm 1: vmovupd 252(ee=reg256#3 +# asm 2: vmovupd 252(ee=%ymm2 +vmovupd 252(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 128(pp=%ymm1 +vmovupd 128(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 284 ] +# asm 1: vmovupd 284(ee=reg256#3 +# asm 2: vmovupd 284(ee=%ymm2 +vmovupd 284(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 160(pp=%ymm1 +vmovupd 160(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 316 ] +# asm 1: vmovupd 316(ee=reg256#3 +# asm 2: vmovupd 316(ee=%ymm2 +vmovupd 316(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 192(pp=%ymm1 +vmovupd 192(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 348 ] +# asm 1: vmovupd 348(ee=reg256#3 +# asm 2: vmovupd 348(ee=%ymm2 +vmovupd 348(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 224(pp=%ymm1 +vmovupd 224(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 380 ] +# asm 1: vmovupd 380(ee=reg256#3 +# asm 2: vmovupd 380(ee=%ymm2 +vmovupd 380(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 256(pp=%ymm1 +vmovupd 256(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 412 ] +# asm 1: vmovupd 412(ee=reg256#3 +# asm 2: vmovupd 412(ee=%ymm2 +vmovupd 412(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 288(pp=%ymm1 +vmovupd 288(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 444 ] +# asm 1: vmovupd 444(ee=reg256#3 +# asm 2: vmovupd 444(ee=%ymm2 +vmovupd 444(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 320(pp=%ymm1 +vmovupd 320(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 476 ] +# asm 1: vmovupd 476(ee=reg256#3 +# asm 2: vmovupd 476(ee=%ymm2 +vmovupd 476(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 352(pp=%ymm1 +vmovupd 352(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 508 ] +# asm 1: vmovupd 508(ee=reg256#3 +# asm 2: vmovupd 508(ee=%ymm2 +vmovupd 508(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 
384(pp=%ymm1 +vmovupd 384(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 540 ] +# asm 1: vmovupd 540(ee=reg256#3 +# asm 2: vmovupd 540(ee=%ymm2 +vmovupd 540(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand buf=stack256#1 +# asm 2: vmovapd buf=0(%rsp) +vmovapd %ymm0,0(%rsp) + +# qhasm: s = *(uint32 *)(input_1 + 416) +# asm 1: movl 416(s=int64#6d +# asm 2: movl 416(s=%r9d +movl 416(%rsi),%r9d + +# qhasm: e = *(uint32 *)(input_2 + 572) +# asm 1: movl 572(e=int64#7d +# asm 2: movl 572(e=%eax +movl 572(%rdx),%eax + +# qhasm: s &= e +# asm 1: and c_all=int64#6 +# asm 2: popcnt c_all=%r9 +popcnt %r9, %r9 + +# qhasm: b64 = mem64[ buf_ptr + 0 ] +# asm 1: movq 0(b64=int64#7 +# asm 2: movq 0(b64=%rax +movq 0(%rcx),%rax + +# qhasm: c = count(b64) +# asm 1: popcnt c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# qhasm: c_all ^= c +# asm 1: xor b64=int64#7 +# asm 2: movq 8(b64=%rax +movq 8(%rcx),%rax + +# qhasm: c = count(b64) +# asm 1: popcnt c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# qhasm: c_all ^= c +# asm 1: xor b64=int64#7 +# asm 2: movq 16(b64=%rax +movq 16(%rcx),%rax + +# qhasm: c = count(b64) +# asm 1: popcnt c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# qhasm: c_all ^= c +# asm 1: xor b64=int64#7 +# asm 2: movq 24(b64=%rax +movq 24(%rcx),%rax + +# qhasm: c = count(b64) +# asm 1: popcnt c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# qhasm: c_all ^= c +# asm 1: xor addr=int64#7 +# asm 2: mov addr=%rax +mov %r8,%rax + +# qhasm: (uint64) addr >>= 3 +# asm 1: shr $3,synd=int64#8 +# asm 2: movzbq 0(synd=%r10 +movzbq 0(%rax),%r10 + +# qhasm: synd <<= 1 +# asm 1: shl $1,ss=reg256#1 +# asm 2: vmovupd 0(ss=%ymm0 +vmovupd 0(%rdi),%ymm0 + +# qhasm: ee = mem256[ input_2 + 0 ] +# asm 1: vmovupd 0(ee=reg256#2 +# asm 2: vmovupd 0(ee=%ymm1 +vmovupd 0(%rdx),%ymm1 + +# qhasm: ss ^= ee +# asm 1: vpxor ss=reg256#1 +# asm 2: vmovupd 32(ss=%ymm0 +vmovupd 32(%rdi),%ymm0 + +# qhasm: ee = mem256[ input_2 + 32 ] +# asm 1: vmovupd 32(ee=reg256#2 +# asm 2: vmovupd 32(ee=%ymm1 +vmovupd 32(%rdx),%ymm1 + +# qhasm: ss ^= ee +# asm 1: vpxor ss=reg256#1 +# asm 2: vmovupd 64(ss=%ymm0 +vmovupd 64(%rdi),%ymm0 + +# qhasm: ee = mem256[ input_2 + 64 ] +# asm 1: vmovupd 64(ee=reg256#2 +# asm 2: vmovupd 64(ee=%ymm1 +vmovupd 64(%rdx),%ymm1 + +# qhasm: ss ^= ee +# asm 1: vpxor ss=reg256#1 +# asm 2: vmovupd 96(ss=%ymm0 +vmovupd 96(%rdi),%ymm0 + +# qhasm: ee = mem256[ input_2 + 96 ] +# asm 1: vmovupd 96(ee=reg256#2 +# asm 2: vmovupd 96(ee=%ymm1 +vmovupd 96(%rdx),%ymm1 + +# qhasm: ss ^= ee +# asm 1: vpxor s=int64#2 +# asm 2: movq 128(s=%rsi +movq 128(%rdi),%rsi + +# qhasm: e = mem64[ input_2 + 128 ] +# asm 1: movq 128(e=int64#4 +# asm 2: movq 128(e=%rcx +movq 128(%rdx),%rcx + +# qhasm: s ^= e +# asm 1: xor s=int64#2 +# asm 2: movq 136(s=%rsi +movq 136(%rdi),%rsi + +# qhasm: e = mem64[ input_2 + 136 ] +# asm 1: movq 136(e=int64#4 +# asm 2: movq 136(e=%rcx +movq 136(%rdx),%rcx + +# qhasm: s ^= e +# asm 1: xor s=int64#2 +# asm 2: movq 144(s=%rsi +movq 144(%rdi),%rsi + +# qhasm: e = mem64[ input_2 + 144 ] +# asm 1: movq 144(e=int64#4 +# asm 2: movq 144(e=%rcx +movq 144(%rdx),%rcx + +# qhasm: s ^= e +# asm 1: xor s=int64#2d +# asm 2: movl 152(s=%esi +movl 152(%rdi),%esi + +# qhasm: e = *(uint32 *)( input_2 + 152 ) +# asm 1: movl 152(e=int64#3d +# asm 2: movl 152(e=%edx +movl 152(%rdx),%edx + +# qhasm: s ^= e +# asm 1: xor mask0=reg128#1 +# asm 2: movdqa PQCLEAN_MCELIECE460896_AVX_MASK5_0(%rip),>mask0=%xmm0 +movdqa PQCLEAN_MCELIECE460896_AVX_MASK5_0(%rip),%xmm0 + +# qhasm: mask1 aligned= mem128[ 
PQCLEAN_MCELIECE460896_AVX_MASK5_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE460896_AVX_MASK5_1(%rip),>mask1=reg128#2 +# asm 2: movdqa PQCLEAN_MCELIECE460896_AVX_MASK5_1(%rip),>mask1=%xmm1 +movdqa PQCLEAN_MCELIECE460896_AVX_MASK5_1(%rip),%xmm1 + +# qhasm: mask2 aligned= mem128[ PQCLEAN_MCELIECE460896_AVX_MASK4_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE460896_AVX_MASK4_0(%rip),>mask2=reg128#3 +# asm 2: movdqa PQCLEAN_MCELIECE460896_AVX_MASK4_0(%rip),>mask2=%xmm2 +movdqa PQCLEAN_MCELIECE460896_AVX_MASK4_0(%rip),%xmm2 + +# qhasm: mask3 aligned= mem128[ PQCLEAN_MCELIECE460896_AVX_MASK4_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE460896_AVX_MASK4_1(%rip),>mask3=reg128#4 +# asm 2: movdqa PQCLEAN_MCELIECE460896_AVX_MASK4_1(%rip),>mask3=%xmm3 +movdqa PQCLEAN_MCELIECE460896_AVX_MASK4_1(%rip),%xmm3 + +# qhasm: mask4 aligned= mem128[ PQCLEAN_MCELIECE460896_AVX_MASK3_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE460896_AVX_MASK3_0(%rip),>mask4=reg128#5 +# asm 2: movdqa PQCLEAN_MCELIECE460896_AVX_MASK3_0(%rip),>mask4=%xmm4 +movdqa PQCLEAN_MCELIECE460896_AVX_MASK3_0(%rip),%xmm4 + +# qhasm: mask5 aligned= mem128[ PQCLEAN_MCELIECE460896_AVX_MASK3_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE460896_AVX_MASK3_1(%rip),>mask5=reg128#6 +# asm 2: movdqa PQCLEAN_MCELIECE460896_AVX_MASK3_1(%rip),>mask5=%xmm5 +movdqa PQCLEAN_MCELIECE460896_AVX_MASK3_1(%rip),%xmm5 + +# qhasm: x0 = mem128[ input_0 + 0 ] +# asm 1: movdqu 0(x0=reg128#7 +# asm 2: movdqu 0(x0=%xmm6 +movdqu 0(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 128 ] +# asm 1: movdqu 128(x1=reg128#8 +# asm 2: movdqu 128(x1=%xmm7 +movdqu 128(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 256 ] +# asm 1: movdqu 256(x2=reg128#9 +# asm 2: movdqu 256(x2=%xmm8 +movdqu 256(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 384 ] +# asm 1: movdqu 384(x3=reg128#10 +# asm 2: movdqu 384(x3=%xmm9 +movdqu 384(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 512 ] +# asm 1: movdqu 512(x4=reg128#11 +# asm 2: movdqu 512(x4=%xmm10 +movdqu 512(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 640 ] +# asm 1: movdqu 640(x5=reg128#12 +# asm 2: movdqu 640(x5=%xmm11 +movdqu 640(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 768 ] +# asm 1: movdqu 768(x6=reg128#13 +# asm 2: movdqu 768(x6=%xmm12 +movdqu 768(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 896 ] +# asm 1: movdqu 896(x7=reg128#14 +# asm 2: movdqu 896(x7=%xmm13 +movdqu 896(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: 
vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor 
x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 0 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 16(x0=%xmm6 +movdqu 
16(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 144 ] +# asm 1: movdqu 144(x1=reg128#8 +# asm 2: movdqu 144(x1=%xmm7 +movdqu 144(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 272 ] +# asm 1: movdqu 272(x2=reg128#9 +# asm 2: movdqu 272(x2=%xmm8 +movdqu 272(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 400 ] +# asm 1: movdqu 400(x3=reg128#10 +# asm 2: movdqu 400(x3=%xmm9 +movdqu 400(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 528 ] +# asm 1: movdqu 528(x4=reg128#11 +# asm 2: movdqu 528(x4=%xmm10 +movdqu 528(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 656 ] +# asm 1: movdqu 656(x5=reg128#12 +# asm 2: movdqu 656(x5=%xmm11 +movdqu 656(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 784 ] +# asm 1: movdqu 784(x6=reg128#13 +# asm 2: movdqu 784(x6=%xmm12 +movdqu 784(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 912 ] +# asm 1: movdqu 912(x7=reg128#14 +# asm 2: movdqu 912(x7=%xmm13 +movdqu 912(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor 
%xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# 
qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 16 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 32(x0=%xmm6 +movdqu 32(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 160 ] +# asm 1: movdqu 160(x1=reg128#8 +# asm 2: movdqu 160(x1=%xmm7 +movdqu 160(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 288 ] +# asm 1: movdqu 288(x2=reg128#9 +# asm 2: movdqu 288(x2=%xmm8 +movdqu 288(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 416 ] +# asm 1: movdqu 416(x3=reg128#10 +# asm 2: movdqu 416(x3=%xmm9 +movdqu 416(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 544 ] +# asm 1: movdqu 544(x4=reg128#11 +# asm 2: movdqu 544(x4=%xmm10 +movdqu 544(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 672 ] +# asm 1: movdqu 672(x5=reg128#12 +# asm 2: movdqu 672(x5=%xmm11 +movdqu 672(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 800 ] +# asm 1: movdqu 800(x6=reg128#13 +# asm 2: movdqu 800(x6=%xmm12 +movdqu 800(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 928 ] +# asm 1: movdqu 928(x7=reg128#14 +# asm 2: movdqu 928(x7=%xmm13 +movdqu 928(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 
| v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 
+# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand 
v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 32 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 48(x0=%xmm6 +movdqu 48(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 176 ] +# asm 1: movdqu 176(x1=reg128#8 +# asm 2: movdqu 176(x1=%xmm7 +movdqu 176(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 304 ] +# asm 1: movdqu 304(x2=reg128#9 +# asm 2: movdqu 304(x2=%xmm8 +movdqu 304(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 432 ] +# asm 1: movdqu 432(x3=reg128#10 +# asm 2: movdqu 432(x3=%xmm9 +movdqu 432(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 560 ] +# asm 1: movdqu 560(x4=reg128#11 +# asm 2: movdqu 560(x4=%xmm10 +movdqu 560(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 688 ] +# asm 1: movdqu 688(x5=reg128#12 +# asm 2: movdqu 688(x5=%xmm11 +movdqu 688(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 816 ] +# asm 1: movdqu 816(x6=reg128#13 +# asm 2: movdqu 816(x6=%xmm12 +movdqu 816(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 944 ] +# asm 1: movdqu 944(x7=reg128#14 +# asm 2: movdqu 944(x7=%xmm13 +movdqu 944(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 
+vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor 
%xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 48 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 64(x0=%xmm6 +movdqu 64(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 192 ] +# asm 1: movdqu 192(x1=reg128#8 +# asm 2: movdqu 192(x1=%xmm7 +movdqu 192(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 320 ] +# asm 1: movdqu 320(x2=reg128#9 +# asm 2: movdqu 320(x2=%xmm8 +movdqu 320(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 448 ] +# asm 1: movdqu 448(x3=reg128#10 +# asm 2: movdqu 448(x3=%xmm9 +movdqu 448(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 576 ] +# asm 1: movdqu 576(x4=reg128#11 +# asm 2: movdqu 576(x4=%xmm10 +movdqu 576(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 704 ] +# asm 1: movdqu 704(x5=reg128#12 +# asm 2: movdqu 704(x5=%xmm11 +movdqu 704(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 832 ] +# asm 1: 
movdqu 832(x6=reg128#13 +# asm 2: movdqu 832(x6=%xmm12 +movdqu 832(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 960 ] +# asm 1: movdqu 960(x7=reg128#14 +# asm 2: movdqu 960(x7=%xmm13 +movdqu 960(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 
= v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand 
+# The remainder of this hunk is straight-line, qhasm-generated bit-matrix
+# transpose code for the PQCLEAN_MCELIECE460896_AVX implementation. Each
+# step loads eight 128-bit rows from input_0 (%rdi) with movdqu, exchanges
+# bit groups of width 32, 16, 8, 4, 2 and 1 between paired rows using
+# vpand/vpor and the matching shifts (vpsllq/vpsrlq, vpslld/vpsrld,
+# vpsllw/vpsrlw, psllq/psrlq) against the constant masks
+# PQCLEAN_MCELIECE460896_AVX_MASK0_0 .. PQCLEAN_MCELIECE460896_AVX_MASK2_1,
+# and stores the rows back with movdqu.
%xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 
+vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 512 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 640(x0=%xmm6 +movdqu 640(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 656 ] +# asm 1: movdqu 656(x1=reg128#8 +# asm 2: movdqu 656(x1=%xmm7 +movdqu 656(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 672 ] +# asm 1: movdqu 672(x2=reg128#9 +# asm 2: movdqu 672(x2=%xmm8 +movdqu 672(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 688 ] +# asm 1: movdqu 688(x3=reg128#10 +# asm 2: movdqu 688(x3=%xmm9 +movdqu 688(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 704 ] +# asm 1: movdqu 704(x4=reg128#11 +# asm 2: movdqu 704(x4=%xmm10 +movdqu 704(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 720 ] +# asm 1: movdqu 720(x5=reg128#12 +# asm 2: movdqu 720(x5=%xmm11 +movdqu 720(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 736 ] +# asm 1: movdqu 736(x6=reg128#13 +# asm 2: movdqu 736(x6=%xmm12 +movdqu 736(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 752 ] +# asm 1: movdqu 752(x7=reg128#14 +# asm 2: movdqu 752(x7=%xmm13 +movdqu 752(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# 
asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand 
v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 640 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 768(x0=%xmm6 +movdqu 768(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 784 ] +# asm 1: movdqu 784(x1=reg128#8 +# asm 2: movdqu 784(x1=%xmm7 +movdqu 784(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 800 ] +# asm 1: movdqu 800(x2=reg128#9 +# asm 2: movdqu 800(x2=%xmm8 +movdqu 800(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 816 ] +# asm 1: movdqu 816(x3=reg128#10 +# asm 2: movdqu 816(x3=%xmm9 +movdqu 816(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 832 ] +# asm 1: movdqu 832(x4=reg128#11 +# asm 2: movdqu 
832(x4=%xmm10 +movdqu 832(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 848 ] +# asm 1: movdqu 848(x5=reg128#12 +# asm 2: movdqu 848(x5=%xmm11 +movdqu 848(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 864 ] +# asm 1: movdqu 864(x6=reg128#13 +# asm 2: movdqu 864(x6=%xmm12 +movdqu 864(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 880 ] +# asm 1: movdqu 880(x7=reg128#14 +# asm 2: movdqu 880(x7=%xmm13 +movdqu 880(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# 
qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + 
+# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 768 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 896(x0=%xmm6 +movdqu 896(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 912 ] +# asm 1: movdqu 912(x1=reg128#8 +# asm 2: movdqu 912(x1=%xmm7 +movdqu 912(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 928 ] +# asm 1: movdqu 928(x2=reg128#9 +# asm 2: movdqu 928(x2=%xmm8 +movdqu 928(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 944 ] +# asm 1: movdqu 944(x3=reg128#10 +# asm 2: movdqu 944(x3=%xmm9 +movdqu 944(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 960 ] +# asm 1: movdqu 960(x4=reg128#11 +# asm 2: movdqu 960(x4=%xmm10 +movdqu 960(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 976 ] +# asm 1: movdqu 976(x5=reg128#12 +# asm 2: movdqu 976(x5=%xmm11 +movdqu 976(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 992 ] +# asm 1: movdqu 992(x6=reg128#13 +# asm 2: movdqu 992(x6=%xmm12 +movdqu 992(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 1008 ] +# asm 1: movdqu 1008(x7=reg128#14 +# asm 2: movdqu 1008(x7=%xmm13 +movdqu 1008(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor 
x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#1 +# asm 2: vpand v10=%xmm0 +vpand %xmm0,%xmm13,%xmm0 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm1,%xmm13,%xmm1 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm12,%xmm0 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm9,%xmm1 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm2,%xmm14,%xmm9 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#13 +# asm 2: vpand v10=%xmm12 +vpand %xmm2,%xmm11,%xmm12 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm3,%xmm14,%xmm13 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm12,%xmm9,%xmm9 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm13,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm10,%xmm12 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#14 +# asm 2: vpand v10=%xmm13 +vpand %xmm2,%xmm0,%xmm13 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm3,%xmm0,%xmm0 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#13 +# asm 2: vpor x1=%xmm12 +vpor %xmm13,%xmm12,%xmm12 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm10,%xmm0 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm2,%xmm6,%xmm10 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#14 +# asm 2: vpand v10=%xmm13 +vpand %xmm2,%xmm8,%xmm13 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#11 +# asm 2: vpor x4=%xmm10 +vpor %xmm13,%xmm10,%xmm10 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand 
%xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#3 +# asm 2: vpand v10=%xmm2 +vpand %xmm2,%xmm1,%xmm2 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm3,%xmm1,%xmm1 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#3 +# asm 2: vpor x5=%xmm2 +vpor %xmm2,%xmm8,%xmm2 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm7,%xmm1 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#4 +# asm 2: vpand v00=%xmm3 +vpand %xmm4,%xmm9,%xmm3 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#8 +# asm 2: vpand v10=%xmm7 +vpand %xmm4,%xmm12,%xmm7 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm5,%xmm9,%xmm8 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm5,%xmm12,%xmm9 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#4 +# asm 2: vpor x0=%xmm3 +vpor %xmm7,%xmm3,%xmm3 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#8 +# asm 2: vpor x1=%xmm7 +vpor %xmm9,%xmm8,%xmm7 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm4,%xmm11,%xmm8 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#10 +# asm 2: vpand v10=%xmm9 +vpand %xmm4,%xmm0,%xmm9 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm5,%xmm0,%xmm0 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#9 +# asm 2: vpor x2=%xmm8 +vpor %xmm9,%xmm8,%xmm8 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm11,%xmm0 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm10,%xmm9 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#12 +# asm 2: vpand v10=%xmm11 +vpand %xmm4,%xmm2,%xmm11 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#3 +# asm 2: vpand v11=%xmm2 +vpand %xmm5,%xmm2,%xmm2 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#10 +# asm 2: vpor x4=%xmm9 +vpor %xmm11,%xmm9,%xmm9 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#3 +# asm 2: vpor x5=%xmm2 +vpor %xmm2,%xmm10,%xmm2 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm4,%xmm6,%xmm10 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#5 +# asm 2: vpand v10=%xmm4 +vpand %xmm4,%xmm1,%xmm4 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm5,%xmm1,%xmm1 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#5 +# asm 2: vpor x6=%xmm4 +vpor %xmm4,%xmm10,%xmm4 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm6,%xmm1 + +# qhasm: mem128[ input_0 + 896 ] = x0 +# asm 1: movdqu mask0=reg256#1 +# asm 2: vmovapd PQCLEAN_MCELIECE460896_AVX_MASK5_0,>mask0=%ymm0 +vmovapd PQCLEAN_MCELIECE460896_AVX_MASK5_0(%rip),%ymm0 + +# qhasm: mask1 aligned= mem256[ PQCLEAN_MCELIECE460896_AVX_MASK5_1 ] +# asm 1: vmovapd PQCLEAN_MCELIECE460896_AVX_MASK5_1,>mask1=reg256#2 +# asm 2: vmovapd 
PQCLEAN_MCELIECE460896_AVX_MASK5_1,>mask1=%ymm1 +vmovapd PQCLEAN_MCELIECE460896_AVX_MASK5_1(%rip),%ymm1 + +# qhasm: mask2 aligned= mem256[ PQCLEAN_MCELIECE460896_AVX_MASK4_0 ] +# asm 1: vmovapd PQCLEAN_MCELIECE460896_AVX_MASK4_0,>mask2=reg256#3 +# asm 2: vmovapd PQCLEAN_MCELIECE460896_AVX_MASK4_0,>mask2=%ymm2 +vmovapd PQCLEAN_MCELIECE460896_AVX_MASK4_0(%rip),%ymm2 + +# qhasm: mask3 aligned= mem256[ PQCLEAN_MCELIECE460896_AVX_MASK4_1 ] +# asm 1: vmovapd PQCLEAN_MCELIECE460896_AVX_MASK4_1,>mask3=reg256#4 +# asm 2: vmovapd PQCLEAN_MCELIECE460896_AVX_MASK4_1,>mask3=%ymm3 +vmovapd PQCLEAN_MCELIECE460896_AVX_MASK4_1(%rip),%ymm3 + +# qhasm: mask4 aligned= mem256[ PQCLEAN_MCELIECE460896_AVX_MASK3_0 ] +# asm 1: vmovapd PQCLEAN_MCELIECE460896_AVX_MASK3_0,>mask4=reg256#5 +# asm 2: vmovapd PQCLEAN_MCELIECE460896_AVX_MASK3_0,>mask4=%ymm4 +vmovapd PQCLEAN_MCELIECE460896_AVX_MASK3_0(%rip),%ymm4 + +# qhasm: mask5 aligned= mem256[ PQCLEAN_MCELIECE460896_AVX_MASK3_1 ] +# asm 1: vmovapd PQCLEAN_MCELIECE460896_AVX_MASK3_1,>mask5=reg256#6 +# asm 2: vmovapd PQCLEAN_MCELIECE460896_AVX_MASK3_1,>mask5=%ymm5 +vmovapd PQCLEAN_MCELIECE460896_AVX_MASK3_1(%rip),%ymm5 + +# qhasm: x0 = mem256[ input_0 + 0 ] +# asm 1: vmovupd 0(x0=reg256#7 +# asm 2: vmovupd 0(x0=%ymm6 +vmovupd 0(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 256 ] +# asm 1: vmovupd 256(x1=reg256#8 +# asm 2: vmovupd 256(x1=%ymm7 +vmovupd 256(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 512 ] +# asm 1: vmovupd 512(x2=reg256#9 +# asm 2: vmovupd 512(x2=%ymm8 +vmovupd 512(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 768 ] +# asm 1: vmovupd 768(x3=reg256#10 +# asm 2: vmovupd 768(x3=%ymm9 +vmovupd 768(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1024 ] +# asm 1: vmovupd 1024(x4=reg256#11 +# asm 2: vmovupd 1024(x4=%ymm10 +vmovupd 1024(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1280 ] +# asm 1: vmovupd 1280(x5=reg256#12 +# asm 2: vmovupd 1280(x5=%ymm11 +vmovupd 1280(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1536 ] +# asm 1: vmovupd 1536(x6=reg256#13 +# asm 2: vmovupd 1536(x6=%ymm12 +vmovupd 1536(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1792 ] +# asm 1: vmovupd 1792(x7=reg256#14 +# asm 2: vmovupd 1792(x7=%ymm13 +vmovupd 1792(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 
2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand 
v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 0 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 32(x0=%ymm6 +vmovupd 32(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 288 ] +# asm 1: vmovupd 288(x1=reg256#8 +# asm 2: vmovupd 288(x1=%ymm7 
+vmovupd 288(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 544 ] +# asm 1: vmovupd 544(x2=reg256#9 +# asm 2: vmovupd 544(x2=%ymm8 +vmovupd 544(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 800 ] +# asm 1: vmovupd 800(x3=reg256#10 +# asm 2: vmovupd 800(x3=%ymm9 +vmovupd 800(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1056 ] +# asm 1: vmovupd 1056(x4=reg256#11 +# asm 2: vmovupd 1056(x4=%ymm10 +vmovupd 1056(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1312 ] +# asm 1: vmovupd 1312(x5=reg256#12 +# asm 2: vmovupd 1312(x5=%ymm11 +vmovupd 1312(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1568 ] +# asm 1: vmovupd 1568(x6=reg256#13 +# asm 2: vmovupd 1568(x6=%ymm12 +vmovupd 1568(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1824 ] +# asm 1: vmovupd 1824(x7=reg256#14 +# asm 2: vmovupd 1824(x7=%ymm13 +vmovupd 1824(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor 
x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand 
[Remainder of the qhasm-generated AVX2 bit-transpose assembly added for the Classic McEliece AVX implementations, not reproduced verbatim here: the hunk repeats the same butterfly pattern shown above — vpand against the mask registers, paired vpsllq/vpsrlq $32, vpslld/vpsrld $16 and vpsllw/vpsrlw $8 shifts recombined with vpor — over 32-byte slices loaded and stored with vmovupd at offsets 0 through 2016(%rdi), and then begins the next routine, which loads the PQCLEAN_MCELIECE460896_AVX_MASK2_*, MASK1_* and MASK0_* constants with vmovapd and applies the corresponding 4-, 2- and 1-bit passes (vpsllq/vpsrlq $4, $2, $1) before storing the results back.]
+vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand 
v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 256 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 512(x0=%ymm6 +vmovupd 512(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 544 ] +# asm 1: vmovupd 544(x1=reg256#8 +# asm 2: vmovupd 544(x1=%ymm7 +vmovupd 544(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 576 ] +# asm 1: vmovupd 576(x2=reg256#9 +# asm 2: vmovupd 576(x2=%ymm8 +vmovupd 576(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 608 ] +# asm 1: vmovupd 608(x3=reg256#10 +# asm 2: vmovupd 608(x3=%ymm9 +vmovupd 608(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 640 ] +# asm 1: vmovupd 640(x4=reg256#11 +# asm 2: vmovupd 640(x4=%ymm10 +vmovupd 640(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 672 ] +# asm 1: vmovupd 672(x5=reg256#12 +# asm 2: vmovupd 672(x5=%ymm11 +vmovupd 672(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 704 ] +# asm 1: vmovupd 704(x6=reg256#13 +# asm 2: vmovupd 704(x6=%ymm12 +vmovupd 704(%rdi),%ymm12 + +# qhasm: x7 = mem256[ 
input_0 + 736 ] +# asm 1: vmovupd 736(x7=reg256#14 +# asm 2: vmovupd 736(x7=%ymm13 +vmovupd 736(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand 
%ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand 
v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 512 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 768(x0=%ymm6 +vmovupd 768(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 800 ] +# asm 1: vmovupd 800(x1=reg256#8 +# asm 2: vmovupd 800(x1=%ymm7 +vmovupd 800(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 832 ] +# asm 1: vmovupd 832(x2=reg256#9 +# asm 2: vmovupd 832(x2=%ymm8 +vmovupd 832(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 864 ] +# asm 1: vmovupd 864(x3=reg256#10 +# asm 2: vmovupd 864(x3=%ymm9 +vmovupd 864(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 896 ] +# asm 1: vmovupd 896(x4=reg256#11 +# asm 2: vmovupd 896(x4=%ymm10 +vmovupd 896(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 928 ] +# asm 1: vmovupd 928(x5=reg256#12 +# asm 2: vmovupd 928(x5=%ymm11 +vmovupd 928(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 960 ] +# asm 1: vmovupd 960(x6=reg256#13 +# asm 2: vmovupd 960(x6=%ymm12 +vmovupd 960(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 992 ] +# asm 1: vmovupd 992(x7=reg256#14 +# asm 2: vmovupd 992(x7=%ymm13 +vmovupd 992(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = 
x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 
+ +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 768 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 1024(x0=%ymm6 +vmovupd 1024(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 1056 ] +# asm 1: vmovupd 1056(x1=reg256#8 +# asm 2: vmovupd 1056(x1=%ymm7 +vmovupd 1056(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 1088 ] +# asm 1: vmovupd 1088(x2=reg256#9 +# asm 2: vmovupd 1088(x2=%ymm8 +vmovupd 1088(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 1120 ] +# asm 1: vmovupd 1120(x3=reg256#10 +# asm 2: vmovupd 1120(x3=%ymm9 +vmovupd 1120(%rdi),%ymm9 + +# qhasm: x4 
= mem256[ input_0 + 1152 ] +# asm 1: vmovupd 1152(x4=reg256#11 +# asm 2: vmovupd 1152(x4=%ymm10 +vmovupd 1152(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1184 ] +# asm 1: vmovupd 1184(x5=reg256#12 +# asm 2: vmovupd 1184(x5=%ymm11 +vmovupd 1184(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1216 ] +# asm 1: vmovupd 1216(x6=reg256#13 +# asm 2: vmovupd 1216(x6=%ymm12 +vmovupd 1216(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1248 ] +# asm 1: vmovupd 1248(x7=reg256#14 +# asm 2: vmovupd 1248(x7=%ymm13 +vmovupd 1248(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# 
qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor 
%ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 1024 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 1280(x0=%ymm6 +vmovupd 1280(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 1312 ] +# asm 1: vmovupd 1312(x1=reg256#8 +# asm 2: vmovupd 1312(x1=%ymm7 +vmovupd 1312(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 1344 ] +# asm 1: vmovupd 1344(x2=reg256#9 +# asm 2: vmovupd 1344(x2=%ymm8 +vmovupd 1344(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 1376 ] +# asm 1: vmovupd 1376(x3=reg256#10 +# asm 2: vmovupd 1376(x3=%ymm9 +vmovupd 1376(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1408 ] +# asm 1: vmovupd 1408(x4=reg256#11 +# asm 2: vmovupd 1408(x4=%ymm10 +vmovupd 1408(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1440 ] +# asm 1: vmovupd 1440(x5=reg256#12 +# asm 2: vmovupd 1440(x5=%ymm11 +vmovupd 1440(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1472 ] +# asm 1: vmovupd 1472(x6=reg256#13 +# asm 2: vmovupd 1472(x6=%ymm12 +vmovupd 1472(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1504 ] +# asm 1: vmovupd 1504(x7=reg256#14 +# asm 2: vmovupd 1504(x7=%ymm13 +vmovupd 1504(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# 
qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 
+vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 1280 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 
1536(x0=%ymm6 +vmovupd 1536(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 1568 ] +# asm 1: vmovupd 1568(x1=reg256#8 +# asm 2: vmovupd 1568(x1=%ymm7 +vmovupd 1568(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 1600 ] +# asm 1: vmovupd 1600(x2=reg256#9 +# asm 2: vmovupd 1600(x2=%ymm8 +vmovupd 1600(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 1632 ] +# asm 1: vmovupd 1632(x3=reg256#10 +# asm 2: vmovupd 1632(x3=%ymm9 +vmovupd 1632(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1664 ] +# asm 1: vmovupd 1664(x4=reg256#11 +# asm 2: vmovupd 1664(x4=%ymm10 +vmovupd 1664(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1696 ] +# asm 1: vmovupd 1696(x5=reg256#12 +# asm 2: vmovupd 1696(x5=%ymm11 +vmovupd 1696(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1728 ] +# asm 1: vmovupd 1728(x6=reg256#13 +# asm 2: vmovupd 1728(x6=%ymm12 +vmovupd 1728(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1760 ] +# asm 1: vmovupd 1760(x7=reg256#14 +# asm 2: vmovupd 1760(x7=%ymm13 +vmovupd 1760(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor 
%ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 
+vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 1536 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 1792(x0=%ymm6 +vmovupd 1792(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 1824 ] +# asm 1: vmovupd 1824(x1=reg256#8 +# asm 2: vmovupd 1824(x1=%ymm7 +vmovupd 1824(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 1856 ] +# asm 1: vmovupd 1856(x2=reg256#9 +# asm 2: vmovupd 1856(x2=%ymm8 +vmovupd 1856(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 1888 ] +# asm 1: vmovupd 1888(x3=reg256#10 +# asm 2: vmovupd 1888(x3=%ymm9 +vmovupd 1888(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1920 ] +# asm 1: vmovupd 1920(x4=reg256#11 +# asm 2: vmovupd 1920(x4=%ymm10 +vmovupd 1920(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1952 ] +# asm 1: vmovupd 1952(x5=reg256#12 +# asm 2: vmovupd 1952(x5=%ymm11 +vmovupd 1952(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1984 ] +# asm 1: vmovupd 1984(x6=reg256#13 +# asm 2: vmovupd 1984(x6=%ymm12 +vmovupd 1984(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 2016 ] +# asm 1: vmovupd 2016(x7=reg256#14 +# asm 2: vmovupd 2016(x7=%ymm13 +vmovupd 2016(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor 
%ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#1 +# asm 2: vpand v10=%ymm0 +vpand %ymm13,%ymm0,%ymm0 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#2 +# asm 2: vpand v11=%ymm1 +vpand %ymm13,%ymm1,%ymm1 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#1 +# asm 2: vpor x3=%ymm0 +vpor %ymm12,%ymm0,%ymm0 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#2 +# asm 2: vpor x7=%ymm1 +vpor %ymm9,%ymm1,%ymm1 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm14,%ymm2,%ymm9 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#13 +# asm 2: vpand v10=%ymm12 +vpand %ymm11,%ymm2,%ymm12 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm14,%ymm3,%ymm13 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm12,%ymm9 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm13,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm10,%ymm2,%ymm12 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#14 +# asm 2: vpand v10=%ymm13 +vpand %ymm0,%ymm2,%ymm13 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#1 +# asm 2: vpand v11=%ymm0 +vpand %ymm0,%ymm3,%ymm0 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#13 +# asm 2: vpor x1=%ymm12 +vpor %ymm12,%ymm13,%ymm12 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#1 +# asm 2: vpor x3=%ymm0 +vpor %ymm10,%ymm0,%ymm0 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand 
%ymm6,%ymm2,%ymm10 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#14 +# asm 2: vpand v10=%ymm13 +vpand %ymm8,%ymm2,%ymm13 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#11 +# asm 2: vpor x4=%ymm10 +vpor %ymm10,%ymm13,%ymm10 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#3 +# asm 2: vpand v10=%ymm2 +vpand %ymm1,%ymm2,%ymm2 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#2 +# asm 2: vpand v11=%ymm1 +vpand %ymm1,%ymm3,%ymm1 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#3 +# asm 2: vpor x5=%ymm2 +vpor %ymm8,%ymm2,%ymm2 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#2 +# asm 2: vpor x7=%ymm1 +vpor %ymm7,%ymm1,%ymm1 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#4 +# asm 2: vpand v00=%ymm3 +vpand %ymm9,%ymm4,%ymm3 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#8 +# asm 2: vpand v10=%ymm7 +vpand %ymm12,%ymm4,%ymm7 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm9,%ymm5,%ymm8 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm12,%ymm5,%ymm9 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#4 +# asm 2: vpor x0=%ymm3 +vpor %ymm3,%ymm7,%ymm3 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#8 +# asm 2: vpor x1=%ymm7 +vpor %ymm8,%ymm9,%ymm7 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm11,%ymm4,%ymm8 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#10 +# asm 2: vpand v10=%ymm9 +vpand %ymm0,%ymm4,%ymm9 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#1 +# asm 2: vpand v11=%ymm0 +vpand %ymm0,%ymm5,%ymm0 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#9 +# asm 2: vpor x2=%ymm8 +vpor %ymm8,%ymm9,%ymm8 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#1 +# asm 2: vpor x3=%ymm0 +vpor %ymm11,%ymm0,%ymm0 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm10,%ymm4,%ymm9 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#12 +# asm 2: vpand v10=%ymm11 +vpand %ymm2,%ymm4,%ymm11 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#3 +# asm 2: vpand v11=%ymm2 +vpand %ymm2,%ymm5,%ymm2 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#10 +# asm 2: vpor x4=%ymm9 +vpor %ymm9,%ymm11,%ymm9 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#3 +# asm 2: vpor x5=%ymm2 +vpor %ymm10,%ymm2,%ymm2 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm6,%ymm4,%ymm10 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#5 +# asm 2: vpand v10=%ymm4 +vpand %ymm1,%ymm4,%ymm4 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 
+# asm 1: vpand v11=reg256#2 +# asm 2: vpand v11=%ymm1 +vpand %ymm1,%ymm5,%ymm1 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#5 +# asm 2: vpor x6=%ymm4 +vpor %ymm10,%ymm4,%ymm4 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#2 +# asm 2: vpor x7=%ymm1 +vpor %ymm6,%ymm1,%ymm1 + +# qhasm: mem256[ input_0 + 1792 ] = x0 +# asm 1: vmovupd +#include + +void PQCLEAN_MCELIECE460896_AVX_uint32_sort(uint32_t *x, size_t n); + +#endif diff --git a/src/kem/classic_mceliece/pqclean_mceliece460896_avx/update_asm.S b/src/kem/classic_mceliece/pqclean_mceliece460896_avx/update_asm.S new file mode 100644 index 0000000000..33046f9d2a --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece460896_avx/update_asm.S @@ -0,0 +1,576 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: int64 s0 + +# qhasm: int64 s1 + +# qhasm: int64 s2 + +# qhasm: enter update_asm +.p2align 5 +.global _PQCLEAN_MCELIECE460896_AVX_update_asm +.global PQCLEAN_MCELIECE460896_AVX_update_asm +_PQCLEAN_MCELIECE460896_AVX_update_asm: +PQCLEAN_MCELIECE460896_AVX_update_asm: +mov %rsp,%r11 +and $31,%r11 +add $0,%r11 +sub %r11,%rsp + +# qhasm: s2 = input_1 +# asm 1: mov s2=int64#2 +# asm 2: mov s2=%rsi +mov %rsi,%rsi + +# qhasm: s0 = mem64[ input_0 + 0 ] +# asm 1: movq 0(s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: 
movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1, + +void PQCLEAN_MCELIECE460896_AVX_store_i(unsigned char *out, uint64_t in, int i) { + int j; + + for (j = 0; j < i; j++) { + out[j] = (in >> (j * 8)) & 0xFF; + } +} + +void PQCLEAN_MCELIECE460896_AVX_store2(unsigned char *dest, uint16_t a) { + dest[0] = a & 0xFF; + dest[1] = a >> 8; +} + +uint16_t PQCLEAN_MCELIECE460896_AVX_load2(const unsigned char *src) { + uint16_t a; + + a = src[1]; + a <<= 8; + a |= src[0]; + + return a & GFMASK; +} + +uint32_t PQCLEAN_MCELIECE460896_AVX_load4(const unsigned char *src) { + uint32_t a; + + a = src[3]; + a <<= 8; + a |= src[2]; + a <<= 8; + a |= src[1]; + a <<= 8; + a |= src[0]; + + return a; +} + +void PQCLEAN_MCELIECE460896_AVX_irr_load(vec128 *out, const unsigned char *in) { + int i, j; + uint64_t v[2]; + uint16_t irr[ SYS_T + 1 ]; + + for (i = 0; i < SYS_T; i++) { + irr[i] = PQCLEAN_MCELIECE460896_AVX_load2(in + i * 2); + irr[i] &= GFMASK; + } + + irr[ SYS_T ] = 1; + + for (i = 0; i < GFBITS; i++) { + v[0] = v[1] = 0; + + for (j = 63; j >= 0; j--) { + v[0] <<= 1; + v[0] |= (irr[j] >> i) & 1; + } + for (j = SYS_T; j >= 64; j--) { + v[1] <<= 1; + v[1] |= (irr[j] >> i) & 1; + } + + out[i] = PQCLEAN_MCELIECE460896_AVX_vec128_set2x(v[0], v[1]); + } +} + +void PQCLEAN_MCELIECE460896_AVX_store8(unsigned char *out, uint64_t in) { + out[0] = (in >> 0x00) & 0xFF; + out[1] = (in >> 0x08) & 0xFF; + out[2] = (in >> 0x10) & 0xFF; + out[3] = (in >> 0x18) & 0xFF; + out[4] = (in >> 0x20) & 0xFF; + out[5] = (in >> 0x28) & 0xFF; + out[6] = (in >> 0x30) & 0xFF; + out[7] = (in >> 0x38) & 0xFF; +} + +uint64_t PQCLEAN_MCELIECE460896_AVX_load8(const unsigned char *in) { + int i; + uint64_t ret = in[7]; + + for (i = 6; i >= 0; i--) { + ret <<= 8; + ret |= in[i]; + } + + return ret; +} + +vec128 PQCLEAN_MCELIECE460896_AVX_load16(const unsigned char *in) { + return PQCLEAN_MCELIECE460896_AVX_vec128_set2x( PQCLEAN_MCELIECE460896_AVX_load8(in), PQCLEAN_MCELIECE460896_AVX_load8(in + 8) ); +} + +void PQCLEAN_MCELIECE460896_AVX_store16(unsigned char *out, vec128 in) { + PQCLEAN_MCELIECE460896_AVX_store8(out + 0, PQCLEAN_MCELIECE460896_AVX_vec128_extract(in, 0)); + PQCLEAN_MCELIECE460896_AVX_store8(out + 8, 
PQCLEAN_MCELIECE460896_AVX_vec128_extract(in, 1)); +} diff --git a/src/kem/classic_mceliece/pqclean_mceliece460896_avx/util.h b/src/kem/classic_mceliece/pqclean_mceliece460896_avx/util.h new file mode 100644 index 0000000000..6a01bd942d --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece460896_avx/util.h @@ -0,0 +1,23 @@ +#ifndef PQCLEAN_MCELIECE460896_AVX_UTIL_H +#define PQCLEAN_MCELIECE460896_AVX_UTIL_H +/* + This file is for loading/storing data in a little-endian fashion +*/ + + +#include "vec128.h" + +#include + +void PQCLEAN_MCELIECE460896_AVX_store_i(unsigned char *out, uint64_t in, int i); +void PQCLEAN_MCELIECE460896_AVX_store2(unsigned char *dest, uint16_t a); +uint16_t PQCLEAN_MCELIECE460896_AVX_load2(const unsigned char *src); +uint32_t PQCLEAN_MCELIECE460896_AVX_load4(const unsigned char *src); +void PQCLEAN_MCELIECE460896_AVX_irr_load(vec128 *out, const unsigned char *in); +void PQCLEAN_MCELIECE460896_AVX_store8(unsigned char *out, uint64_t in); +uint64_t PQCLEAN_MCELIECE460896_AVX_load8(const unsigned char *in); +vec128 PQCLEAN_MCELIECE460896_AVX_load16(const unsigned char *in); + +void PQCLEAN_MCELIECE460896_AVX_store16(unsigned char *out, vec128 in); + +#endif diff --git a/src/kem/classic_mceliece/pqclean_mceliece460896_avx/vec128.c b/src/kem/classic_mceliece/pqclean_mceliece460896_avx/vec128.c new file mode 100644 index 0000000000..1765598fba --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece460896_avx/vec128.c @@ -0,0 +1,83 @@ +#include "vec128.h" + +vec128 PQCLEAN_MCELIECE460896_AVX_vec128_set1_16b(uint16_t a) { + return _mm_set1_epi16(a); +} + +int PQCLEAN_MCELIECE460896_AVX_vec128_testz(vec128 a) { + return _mm_testz_si128(a, a); +} + +vec128 PQCLEAN_MCELIECE460896_AVX_vec128_setzero(void) { + return _mm_setzero_si128(); +} + +vec128 PQCLEAN_MCELIECE460896_AVX_vec128_and(vec128 a, vec128 b) { + return _mm_and_si128(a, b); +} + +vec128 PQCLEAN_MCELIECE460896_AVX_vec128_xor(vec128 a, vec128 b) { + return _mm_xor_si128(a, b); +} + +vec128 PQCLEAN_MCELIECE460896_AVX_vec128_or(vec128 a, vec128 b) { + return _mm_or_si128(a, b); +} + +vec128 PQCLEAN_MCELIECE460896_AVX_vec128_sll_2x(vec128 a, int s) { + return _mm_slli_epi64(a, s); +} + +vec128 PQCLEAN_MCELIECE460896_AVX_vec128_srl_2x(vec128 a, int s) { + return _mm_srli_epi64(a, s); +} + +vec128 PQCLEAN_MCELIECE460896_AVX_vec128_set2x(uint64_t a0, uint64_t a1) { + return _mm_set_epi64x(a1, a0); +} + +vec128 PQCLEAN_MCELIECE460896_AVX_vec128_unpack_low(vec128 a, vec128 b) { + return _mm_unpacklo_epi64(a, b); +} + +vec128 PQCLEAN_MCELIECE460896_AVX_vec128_unpack_high(vec128 a, vec128 b) { + return _mm_unpackhi_epi64(a, b); +} + +vec128 PQCLEAN_MCELIECE460896_AVX_vec128_setbits(uint64_t a) { + return _mm_set1_epi64x(-a); +} + +void PQCLEAN_MCELIECE460896_AVX_vec128_copy(vec128 *dest, const vec128 *src) { + int i; + + for (i = 0; i < GFBITS; i++) { + dest[i] = src[i]; + } +} + +void PQCLEAN_MCELIECE460896_AVX_vec128_add(vec128 *c, const vec128 *a, const vec128 *b) { + int i; + + for (i = 0; i < GFBITS; i++) { + c[i] = PQCLEAN_MCELIECE460896_AVX_vec128_xor(a[i], b[i]); + } +} + +vec128 PQCLEAN_MCELIECE460896_AVX_vec128_or_reduce(const vec128 *a) { + int i; + vec128 ret; + + ret = a[0]; + for (i = 1; i < GFBITS; i++) { + ret = PQCLEAN_MCELIECE460896_AVX_vec128_or(ret, a[i]); + } + + return ret; +} + +/* bitsliced field multiplications */ +void PQCLEAN_MCELIECE460896_AVX_vec128_mul(vec128 *h, vec128 *f, const vec128 *g) { + PQCLEAN_MCELIECE460896_AVX_vec128_mul_asm(h, f, g, 16); +} + diff --git 
a/src/kem/classic_mceliece/pqclean_mceliece460896_avx/vec128.h b/src/kem/classic_mceliece/pqclean_mceliece460896_avx/vec128.h new file mode 100644 index 0000000000..cb6fa2b24a --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece460896_avx/vec128.h @@ -0,0 +1,41 @@ +#ifndef PQCLEAN_MCELIECE460896_AVX_VEC128_H +#define PQCLEAN_MCELIECE460896_AVX_VEC128_H +/* + This file is for functions related to 128-bit vectors + including functions for bitsliced field operations +*/ + + +#include "params.h" + +#include +#include + +typedef __m128i vec128; + +// this needs to be a macro, because +// _mm_extract_epi64 requires a literal int argument. +#define PQCLEAN_MCELIECE460896_AVX_vec128_extract(a, i) ((uint64_t) _mm_extract_epi64((vec128) (a), (i))) + +int PQCLEAN_MCELIECE460896_AVX_vec128_testz(vec128 a); +vec128 PQCLEAN_MCELIECE460896_AVX_vec128_set1_16b(uint16_t a); +vec128 PQCLEAN_MCELIECE460896_AVX_vec128_setzero(void); +vec128 PQCLEAN_MCELIECE460896_AVX_vec128_and(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE460896_AVX_vec128_xor(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE460896_AVX_vec128_or(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE460896_AVX_vec128_sll_2x(vec128 a, int s); +vec128 PQCLEAN_MCELIECE460896_AVX_vec128_srl_2x(vec128 a, int s); +vec128 PQCLEAN_MCELIECE460896_AVX_vec128_set2x(uint64_t a0, uint64_t a1); +vec128 PQCLEAN_MCELIECE460896_AVX_vec128_unpack_low(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE460896_AVX_vec128_unpack_high(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE460896_AVX_vec128_setbits(uint64_t a); +void PQCLEAN_MCELIECE460896_AVX_vec128_copy(vec128 *dest, const vec128 *src); +void PQCLEAN_MCELIECE460896_AVX_vec128_add(vec128 *c, const vec128 *a, const vec128 *b); +vec128 PQCLEAN_MCELIECE460896_AVX_vec128_or_reduce(const vec128 *a); + +extern void PQCLEAN_MCELIECE460896_AVX_vec128_mul_asm(vec128 *, vec128 *, const vec128 *, int); + +/* bitsliced field multiplications */ +void PQCLEAN_MCELIECE460896_AVX_vec128_mul(vec128 *h, vec128 *f, const vec128 *g); + +#endif diff --git a/src/kem/classic_mceliece/pqclean_mceliece460896_avx/vec128_mul_asm.S b/src/kem/classic_mceliece/pqclean_mceliece460896_avx/vec128_mul_asm.S new file mode 100644 index 0000000000..7387f125dd --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece460896_avx/vec128_mul_asm.S @@ -0,0 +1,1816 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: reg256 b0 + +# qhasm: reg256 b1 + +# qhasm: reg256 b2 + +# qhasm: reg256 b3 + +# qhasm: reg256 b4 + +# qhasm: reg256 b5 + +# qhasm: reg256 b6 + +# qhasm: reg256 b7 + +# qhasm: reg256 b8 + +# qhasm: reg256 b9 + +# qhasm: reg256 b10 + +# qhasm: reg256 b11 + +# qhasm: reg256 b12 + +# qhasm: reg256 a0 + +# qhasm: reg256 a1 + +# qhasm: reg256 a2 + +# qhasm: reg256 a3 + +# qhasm: reg256 a4 + +# qhasm: reg256 a5 + +# qhasm: reg256 a6 + +# qhasm: reg256 r0 + +# qhasm: reg256 r1 + +# qhasm: reg256 r2 + +# qhasm: reg256 r3 + +# qhasm: reg256 r4 + +# qhasm: reg256 r5 + +# qhasm: reg256 r6 + +# qhasm: reg256 r7 + +# qhasm: reg256 r8 + +# qhasm: reg256 r9 + +# qhasm: reg256 r10 + +# qhasm: reg256 r11 + +# qhasm: reg256 r12 + +# qhasm: reg256 r13 + +# qhasm: reg256 r14 + +# qhasm: reg256 
r15 + +# qhasm: reg256 r16 + +# qhasm: reg256 r17 + +# qhasm: reg256 r18 + +# qhasm: reg256 r19 + +# qhasm: reg256 r20 + +# qhasm: reg256 r21 + +# qhasm: reg256 r22 + +# qhasm: reg256 r23 + +# qhasm: reg256 r24 + +# qhasm: reg256 r + +# qhasm: reg128 h0 + +# qhasm: reg128 h1 + +# qhasm: reg128 h2 + +# qhasm: reg128 h3 + +# qhasm: reg128 h4 + +# qhasm: reg128 h5 + +# qhasm: reg128 h6 + +# qhasm: reg128 h7 + +# qhasm: reg128 h8 + +# qhasm: reg128 h9 + +# qhasm: reg128 h10 + +# qhasm: reg128 h11 + +# qhasm: reg128 h12 + +# qhasm: reg128 h13 + +# qhasm: reg128 h14 + +# qhasm: reg128 h15 + +# qhasm: reg128 h16 + +# qhasm: reg128 h17 + +# qhasm: reg128 h18 + +# qhasm: reg128 h19 + +# qhasm: reg128 h20 + +# qhasm: reg128 h21 + +# qhasm: reg128 h22 + +# qhasm: reg128 h23 + +# qhasm: reg128 h24 + +# qhasm: stack4864 buf + +# qhasm: int64 ptr + +# qhasm: int64 tmp + +# qhasm: enter vec128_mul_asm +.p2align 5 +.global _PQCLEAN_MCELIECE460896_AVX_vec128_mul_asm +.global PQCLEAN_MCELIECE460896_AVX_vec128_mul_asm +_PQCLEAN_MCELIECE460896_AVX_vec128_mul_asm: +PQCLEAN_MCELIECE460896_AVX_vec128_mul_asm: +mov %rsp,%r11 +and $31,%r11 +add $608,%r11 +sub %r11,%rsp + +# qhasm: ptr = &buf +# asm 1: leaq ptr=int64#5 +# asm 2: leaq ptr=%r8 +leaq 0(%rsp),%r8 + +# qhasm: tmp = input_3 +# asm 1: mov tmp=int64#6 +# asm 2: mov tmp=%r9 +mov %rcx,%r9 + +# qhasm: tmp *= 12 +# asm 1: imulq $12,tmp=int64#6 +# asm 2: imulq $12,tmp=%r9 +imulq $12,%r9,%r9 + +# qhasm: input_2 += tmp +# asm 1: add b12=reg256#1 +# asm 2: vbroadcasti128 0(b12=%ymm0 +vbroadcasti128 0(%rdx), %ymm0 + +# qhasm: input_2 -= input_3 +# asm 1: sub a6=reg256#2 +# asm 2: vpxor a6=%ymm1 +vpxor %ymm1,%ymm1,%ymm1 + +# qhasm: a6[0] = mem128[ input_1 + 96 ] +# asm 1: vinsertf128 $0x0,96(r18=reg256#3 +# asm 2: vpand r18=%ymm2 +vpand %ymm0,%ymm1,%ymm2 + +# qhasm: mem256[ ptr + 576 ] = r18 +# asm 1: vmovupd r17=reg256#4 +# asm 2: vpand r17=%ymm3 +vpand %ymm0,%ymm2,%ymm3 + +# qhasm: a4[0] = mem128[ input_1 + 64 ] +# asm 1: vinsertf128 $0x0,64(r16=reg256#6 +# asm 2: vpand r16=%ymm5 +vpand %ymm0,%ymm4,%ymm5 + +# qhasm: a3[0] = mem128[ input_1 + 48 ] +# asm 1: vinsertf128 $0x0,48(r15=reg256#8 +# asm 2: vpand r15=%ymm7 +vpand %ymm0,%ymm6,%ymm7 + +# qhasm: a2[0] = mem128[ input_1 + 32 ] +# asm 1: vinsertf128 $0x0,32(r14=reg256#10 +# asm 2: vpand r14=%ymm9 +vpand %ymm0,%ymm8,%ymm9 + +# qhasm: a1[0] = mem128[ input_1 + 16 ] +# asm 1: vinsertf128 $0x0,16(r13=reg256#12 +# asm 2: vpand r13=%ymm11 +vpand %ymm0,%ymm10,%ymm11 + +# qhasm: a0[0] = mem128[ input_1 + 0 ] +# asm 1: vinsertf128 $0x0,0(r12=reg256#1 +# asm 2: vpand r12=%ymm0 +vpand %ymm0,%ymm12,%ymm0 + +# qhasm: b11 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b11=reg256#14 +# asm 2: vbroadcasti128 0(b11=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm2,%ymm3 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm4,%ymm3 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm6,%ymm3 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm8,%ymm3 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm10,%ymm3 + +# qhasm: r12 ^= r +# asm 1: vpxor r11=reg256#4 +# asm 2: vpand r11=%ymm3 +vpand %ymm13,%ymm12,%ymm3 + +# qhasm: b10 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 
0(b10=reg256#14 +# asm 2: vbroadcasti128 0(b10=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm2,%ymm5 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm4,%ymm5 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm6,%ymm5 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm8,%ymm5 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm10,%ymm5 + +# qhasm: r11 ^= r +# asm 1: vpxor r10=reg256#6 +# asm 2: vpand r10=%ymm5 +vpand %ymm13,%ymm12,%ymm5 + +# qhasm: b9 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b9=reg256#14 +# asm 2: vbroadcasti128 0(b9=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm2,%ymm7 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm4,%ymm7 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm6,%ymm7 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm8,%ymm7 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm10,%ymm7 + +# qhasm: r10 ^= r +# asm 1: vpxor r9=reg256#8 +# asm 2: vpand r9=%ymm7 +vpand %ymm13,%ymm12,%ymm7 + +# qhasm: b8 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b8=reg256#14 +# asm 2: vbroadcasti128 0(b8=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm2,%ymm9 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm6,%ymm9 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm8,%ymm9 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm10,%ymm9 + +# qhasm: r9 ^= r +# asm 1: vpxor r8=reg256#10 +# asm 2: vpand r8=%ymm9 +vpand %ymm13,%ymm12,%ymm9 + +# qhasm: b7 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b7=reg256#14 +# asm 2: vbroadcasti128 0(b7=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm2,%ymm11 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm4,%ymm11 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm6,%ymm11 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm8,%ymm11 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm10,%ymm11 + +# qhasm: r8 ^= r +# asm 1: vpxor r7=reg256#12 +# asm 2: vpand r7=%ymm11 +vpand %ymm13,%ymm12,%ymm11 + +# qhasm: b6 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b6=reg256#14 +# asm 2: vbroadcasti128 0(b6=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 
1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm2,%ymm0 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm4,%ymm0 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm6,%ymm0 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm8,%ymm0 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm10,%ymm0 + +# qhasm: r7 ^= r +# asm 1: vpxor r6=reg256#1 +# asm 2: vpand r6=%ymm0 +vpand %ymm13,%ymm12,%ymm0 + +# qhasm: b5 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b5=reg256#14 +# asm 2: vbroadcasti128 0(b5=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm2,%ymm3 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm4,%ymm3 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm6,%ymm3 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm8,%ymm3 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm10,%ymm3 + +# qhasm: r6 ^= r +# asm 1: vpxor r5=reg256#4 +# asm 2: vpand r5=%ymm3 +vpand %ymm13,%ymm12,%ymm3 + +# qhasm: b4 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b4=reg256#14 +# asm 2: vbroadcasti128 0(b4=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm2,%ymm5 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm4,%ymm5 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm6,%ymm5 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm8,%ymm5 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm10,%ymm5 + +# qhasm: r5 ^= r +# asm 1: vpxor r4=reg256#6 +# asm 2: vpand r4=%ymm5 +vpand %ymm13,%ymm12,%ymm5 + +# qhasm: b3 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b3=reg256#14 +# asm 2: vbroadcasti128 0(b3=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm2,%ymm7 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm4,%ymm7 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm6,%ymm7 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm8,%ymm7 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm10,%ymm7 + +# qhasm: r4 ^= r +# asm 1: vpxor r3=reg256#8 +# asm 2: vpand r3=%ymm7 +vpand %ymm13,%ymm12,%ymm7 + +# qhasm: b2 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b2=reg256#14 +# asm 2: vbroadcasti128 0(b2=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand 
%ymm13,%ymm2,%ymm9 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm6,%ymm9 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm8,%ymm9 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm10,%ymm9 + +# qhasm: r3 ^= r +# asm 1: vpxor r2=reg256#10 +# asm 2: vpand r2=%ymm9 +vpand %ymm13,%ymm12,%ymm9 + +# qhasm: b1 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b1=reg256#14 +# asm 2: vbroadcasti128 0(b1=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm2,%ymm11 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm4,%ymm11 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm6,%ymm11 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm8,%ymm11 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm10,%ymm11 + +# qhasm: r2 ^= r +# asm 1: vpxor r1=reg256#12 +# asm 2: vpand r1=%ymm11 +vpand %ymm13,%ymm12,%ymm11 + +# qhasm: b0 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b0=reg256#14 +# asm 2: vbroadcasti128 0(b0=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#2 +# asm 2: vpand r=%ymm1 +vpand %ymm13,%ymm1,%ymm1 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm2,%ymm0 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm4,%ymm0 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm6,%ymm0 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm8,%ymm0 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm10,%ymm0 + +# qhasm: r1 ^= r +# asm 1: vpxor r0=reg256#1 +# asm 2: vpand r0=%ymm0 +vpand %ymm13,%ymm12,%ymm0 + +# qhasm: mem256[ ptr + 160 ] = r5 +# asm 1: vmovupd h24=reg128#1 +# asm 2: movdqu 560(h24=%xmm0 +movdqu 560(%r8),%xmm0 + +# qhasm: h11 = h24 +# asm 1: movdqa h11=reg128#2 +# asm 2: movdqa h11=%xmm1 +movdqa %xmm0,%xmm1 + +# qhasm: h12 = h24 +# asm 1: movdqa h12=reg128#3 +# asm 2: movdqa h12=%xmm2 +movdqa %xmm0,%xmm2 + +# qhasm: h14 = h24 +# asm 1: movdqa h14=reg128#4 +# asm 2: movdqa h14=%xmm3 +movdqa %xmm0,%xmm3 + +# qhasm: h15 = h24 +# asm 1: movdqa h15=reg128#1 +# asm 2: movdqa h15=%xmm0 +movdqa %xmm0,%xmm0 + +# qhasm: h23 = mem128[ ptr + 528 ] +# asm 1: movdqu 528(h23=reg128#5 +# asm 2: movdqu 528(h23=%xmm4 +movdqu 528(%r8),%xmm4 + +# qhasm: h10 = h23 +# asm 1: movdqa h10=reg128#6 +# asm 2: movdqa h10=%xmm5 +movdqa %xmm4,%xmm5 + +# qhasm: h11 = h11 ^ h23 +# asm 1: vpxor h11=reg128#2 +# asm 2: vpxor h11=%xmm1 +vpxor %xmm4,%xmm1,%xmm1 + +# qhasm: h13 = h23 +# asm 1: movdqa h13=reg128#7 +# asm 2: movdqa h13=%xmm6 +movdqa %xmm4,%xmm6 + +# qhasm: h14 = h14 ^ h23 +# asm 1: vpxor h14=reg128#4 +# asm 2: vpxor h14=%xmm3 +vpxor %xmm4,%xmm3,%xmm3 + +# qhasm: h22 = mem128[ ptr + 496 ] +# asm 1: movdqu 496(h22=reg128#5 +# asm 2: movdqu 496(h22=%xmm4 +movdqu 496(%r8),%xmm4 + +# qhasm: h9 = h22 +# asm 1: movdqa h9=reg128#8 +# asm 2: movdqa h9=%xmm7 +movdqa %xmm4,%xmm7 + +# qhasm: h10 = h10 ^ h22 +# asm 1: vpxor 
h10=reg128#6 +# asm 2: vpxor h10=%xmm5 +vpxor %xmm4,%xmm5,%xmm5 + +# qhasm: h12 = h12 ^ h22 +# asm 1: vpxor h12=reg128#3 +# asm 2: vpxor h12=%xmm2 +vpxor %xmm4,%xmm2,%xmm2 + +# qhasm: h13 = h13 ^ h22 +# asm 1: vpxor h13=reg128#5 +# asm 2: vpxor h13=%xmm4 +vpxor %xmm4,%xmm6,%xmm4 + +# qhasm: h21 = mem128[ ptr + 464 ] +# asm 1: movdqu 464(h21=reg128#7 +# asm 2: movdqu 464(h21=%xmm6 +movdqu 464(%r8),%xmm6 + +# qhasm: h8 = h21 +# asm 1: movdqa h8=reg128#9 +# asm 2: movdqa h8=%xmm8 +movdqa %xmm6,%xmm8 + +# qhasm: h9 = h9 ^ h21 +# asm 1: vpxor h9=reg128#8 +# asm 2: vpxor h9=%xmm7 +vpxor %xmm6,%xmm7,%xmm7 + +# qhasm: h11 = h11 ^ h21 +# asm 1: vpxor h11=reg128#2 +# asm 2: vpxor h11=%xmm1 +vpxor %xmm6,%xmm1,%xmm1 + +# qhasm: h12 = h12 ^ h21 +# asm 1: vpxor h12=reg128#3 +# asm 2: vpxor h12=%xmm2 +vpxor %xmm6,%xmm2,%xmm2 + +# qhasm: h20 = mem128[ ptr + 432 ] +# asm 1: movdqu 432(h20=reg128#7 +# asm 2: movdqu 432(h20=%xmm6 +movdqu 432(%r8),%xmm6 + +# qhasm: h7 = h20 +# asm 1: movdqa h7=reg128#10 +# asm 2: movdqa h7=%xmm9 +movdqa %xmm6,%xmm9 + +# qhasm: h8 = h8 ^ h20 +# asm 1: vpxor h8=reg128#9 +# asm 2: vpxor h8=%xmm8 +vpxor %xmm6,%xmm8,%xmm8 + +# qhasm: h10 = h10 ^ h20 +# asm 1: vpxor h10=reg128#6 +# asm 2: vpxor h10=%xmm5 +vpxor %xmm6,%xmm5,%xmm5 + +# qhasm: h11 = h11 ^ h20 +# asm 1: vpxor h11=reg128#2 +# asm 2: vpxor h11=%xmm1 +vpxor %xmm6,%xmm1,%xmm1 + +# qhasm: h19 = mem128[ ptr + 400 ] +# asm 1: movdqu 400(h19=reg128#7 +# asm 2: movdqu 400(h19=%xmm6 +movdqu 400(%r8),%xmm6 + +# qhasm: h6 = h19 +# asm 1: movdqa h6=reg128#11 +# asm 2: movdqa h6=%xmm10 +movdqa %xmm6,%xmm10 + +# qhasm: h7 = h7 ^ h19 +# asm 1: vpxor h7=reg128#10 +# asm 2: vpxor h7=%xmm9 +vpxor %xmm6,%xmm9,%xmm9 + +# qhasm: h9 = h9 ^ h19 +# asm 1: vpxor h9=reg128#8 +# asm 2: vpxor h9=%xmm7 +vpxor %xmm6,%xmm7,%xmm7 + +# qhasm: h10 = h10 ^ h19 +# asm 1: vpxor h10=reg128#6 +# asm 2: vpxor h10=%xmm5 +vpxor %xmm6,%xmm5,%xmm5 + +# qhasm: h18 = mem128[ ptr + 368 ] +# asm 1: movdqu 368(h18=reg128#7 +# asm 2: movdqu 368(h18=%xmm6 +movdqu 368(%r8),%xmm6 + +# qhasm: h18 = h18 ^ mem128[ ptr + 576 ] +# asm 1: vpxor 576(h18=reg128#7 +# asm 2: vpxor 576(h18=%xmm6 +vpxor 576(%r8),%xmm6,%xmm6 + +# qhasm: h5 = h18 +# asm 1: movdqa h5=reg128#12 +# asm 2: movdqa h5=%xmm11 +movdqa %xmm6,%xmm11 + +# qhasm: h6 = h6 ^ h18 +# asm 1: vpxor h6=reg128#11 +# asm 2: vpxor h6=%xmm10 +vpxor %xmm6,%xmm10,%xmm10 + +# qhasm: h8 = h8 ^ h18 +# asm 1: vpxor h8=reg128#9 +# asm 2: vpxor h8=%xmm8 +vpxor %xmm6,%xmm8,%xmm8 + +# qhasm: h9 = h9 ^ h18 +# asm 1: vpxor h9=reg128#7 +# asm 2: vpxor h9=%xmm6 +vpxor %xmm6,%xmm7,%xmm6 + +# qhasm: h17 = mem128[ ptr + 336 ] +# asm 1: movdqu 336(h17=reg128#8 +# asm 2: movdqu 336(h17=%xmm7 +movdqu 336(%r8),%xmm7 + +# qhasm: h17 = h17 ^ mem128[ ptr + 544 ] +# asm 1: vpxor 544(h17=reg128#8 +# asm 2: vpxor 544(h17=%xmm7 +vpxor 544(%r8),%xmm7,%xmm7 + +# qhasm: h4 = h17 +# asm 1: movdqa h4=reg128#13 +# asm 2: movdqa h4=%xmm12 +movdqa %xmm7,%xmm12 + +# qhasm: h5 = h5 ^ h17 +# asm 1: vpxor h5=reg128#12 +# asm 2: vpxor h5=%xmm11 +vpxor %xmm7,%xmm11,%xmm11 + +# qhasm: h7 = h7 ^ h17 +# asm 1: vpxor h7=reg128#10 +# asm 2: vpxor h7=%xmm9 +vpxor %xmm7,%xmm9,%xmm9 + +# qhasm: h8 = h8 ^ h17 +# asm 1: vpxor h8=reg128#8 +# asm 2: vpxor h8=%xmm7 +vpxor %xmm7,%xmm8,%xmm7 + +# qhasm: h16 = mem128[ ptr + 304 ] +# asm 1: movdqu 304(h16=reg128#9 +# asm 2: movdqu 304(h16=%xmm8 +movdqu 304(%r8),%xmm8 + +# qhasm: h16 = h16 ^ mem128[ ptr + 512 ] +# asm 1: vpxor 512(h16=reg128#9 +# asm 2: vpxor 512(h16=%xmm8 +vpxor 512(%r8),%xmm8,%xmm8 + +# qhasm: h3 = h16 +# asm 1: 
movdqa h3=reg128#14 +# asm 2: movdqa h3=%xmm13 +movdqa %xmm8,%xmm13 + +# qhasm: h4 = h4 ^ h16 +# asm 1: vpxor h4=reg128#13 +# asm 2: vpxor h4=%xmm12 +vpxor %xmm8,%xmm12,%xmm12 + +# qhasm: h6 = h6 ^ h16 +# asm 1: vpxor h6=reg128#11 +# asm 2: vpxor h6=%xmm10 +vpxor %xmm8,%xmm10,%xmm10 + +# qhasm: h7 = h7 ^ h16 +# asm 1: vpxor h7=reg128#9 +# asm 2: vpxor h7=%xmm8 +vpxor %xmm8,%xmm9,%xmm8 + +# qhasm: h15 = h15 ^ mem128[ ptr + 272 ] +# asm 1: vpxor 272(h15=reg128#1 +# asm 2: vpxor 272(h15=%xmm0 +vpxor 272(%r8),%xmm0,%xmm0 + +# qhasm: h15 = h15 ^ mem128[ ptr + 480 ] +# asm 1: vpxor 480(h15=reg128#1 +# asm 2: vpxor 480(h15=%xmm0 +vpxor 480(%r8),%xmm0,%xmm0 + +# qhasm: h2 = h15 +# asm 1: movdqa h2=reg128#10 +# asm 2: movdqa h2=%xmm9 +movdqa %xmm0,%xmm9 + +# qhasm: h3 = h3 ^ h15 +# asm 1: vpxor h3=reg128#14 +# asm 2: vpxor h3=%xmm13 +vpxor %xmm0,%xmm13,%xmm13 + +# qhasm: h5 = h5 ^ h15 +# asm 1: vpxor h5=reg128#12 +# asm 2: vpxor h5=%xmm11 +vpxor %xmm0,%xmm11,%xmm11 + +# qhasm: h6 = h6 ^ h15 +# asm 1: vpxor h6=reg128#1 +# asm 2: vpxor h6=%xmm0 +vpxor %xmm0,%xmm10,%xmm0 + +# qhasm: h14 = h14 ^ mem128[ ptr + 240 ] +# asm 1: vpxor 240(h14=reg128#4 +# asm 2: vpxor 240(h14=%xmm3 +vpxor 240(%r8),%xmm3,%xmm3 + +# qhasm: h14 = h14 ^ mem128[ ptr + 448 ] +# asm 1: vpxor 448(h14=reg128#4 +# asm 2: vpxor 448(h14=%xmm3 +vpxor 448(%r8),%xmm3,%xmm3 + +# qhasm: h1 = h14 +# asm 1: movdqa h1=reg128#11 +# asm 2: movdqa h1=%xmm10 +movdqa %xmm3,%xmm10 + +# qhasm: h2 = h2 ^ h14 +# asm 1: vpxor h2=reg128#10 +# asm 2: vpxor h2=%xmm9 +vpxor %xmm3,%xmm9,%xmm9 + +# qhasm: h4 = h4 ^ h14 +# asm 1: vpxor h4=reg128#13 +# asm 2: vpxor h4=%xmm12 +vpxor %xmm3,%xmm12,%xmm12 + +# qhasm: h5 = h5 ^ h14 +# asm 1: vpxor h5=reg128#4 +# asm 2: vpxor h5=%xmm3 +vpxor %xmm3,%xmm11,%xmm3 + +# qhasm: h13 = h13 ^ mem128[ ptr + 208 ] +# asm 1: vpxor 208(h13=reg128#5 +# asm 2: vpxor 208(h13=%xmm4 +vpxor 208(%r8),%xmm4,%xmm4 + +# qhasm: h13 = h13 ^ mem128[ ptr + 416 ] +# asm 1: vpxor 416(h13=reg128#5 +# asm 2: vpxor 416(h13=%xmm4 +vpxor 416(%r8),%xmm4,%xmm4 + +# qhasm: h0 = h13 +# asm 1: movdqa h0=reg128#12 +# asm 2: movdqa h0=%xmm11 +movdqa %xmm4,%xmm11 + +# qhasm: h1 = h1 ^ h13 +# asm 1: vpxor h1=reg128#11 +# asm 2: vpxor h1=%xmm10 +vpxor %xmm4,%xmm10,%xmm10 + +# qhasm: h3 = h3 ^ h13 +# asm 1: vpxor h3=reg128#14 +# asm 2: vpxor h3=%xmm13 +vpxor %xmm4,%xmm13,%xmm13 + +# qhasm: h4 = h4 ^ h13 +# asm 1: vpxor h4=reg128#5 +# asm 2: vpxor h4=%xmm4 +vpxor %xmm4,%xmm12,%xmm4 + +# qhasm: h12 = h12 ^ mem128[ ptr + 384 ] +# asm 1: vpxor 384(h12=reg128#3 +# asm 2: vpxor 384(h12=%xmm2 +vpxor 384(%r8),%xmm2,%xmm2 + +# qhasm: h12 = h12 ^ mem128[ ptr + 176 ] +# asm 1: vpxor 176(h12=reg128#3 +# asm 2: vpxor 176(h12=%xmm2 +vpxor 176(%r8),%xmm2,%xmm2 + +# qhasm: mem128[ input_0 + 192 ] = h12 +# asm 1: movdqu h11=reg128#2 +# asm 2: vpxor 352(h11=%xmm1 +vpxor 352(%r8),%xmm1,%xmm1 + +# qhasm: h11 = h11 ^ mem128[ ptr + 144 ] +# asm 1: vpxor 144(h11=reg128#2 +# asm 2: vpxor 144(h11=%xmm1 +vpxor 144(%r8),%xmm1,%xmm1 + +# qhasm: mem128[ input_0 + 176 ] = h11 +# asm 1: movdqu h10=reg128#2 +# asm 2: vpxor 320(h10=%xmm1 +vpxor 320(%r8),%xmm5,%xmm1 + +# qhasm: h10 = h10 ^ mem128[ ptr + 112 ] +# asm 1: vpxor 112(h10=reg128#2 +# asm 2: vpxor 112(h10=%xmm1 +vpxor 112(%r8),%xmm1,%xmm1 + +# qhasm: mem128[ input_0 + 160 ] = h10 +# asm 1: movdqu h9=reg128#2 +# asm 2: vpxor 288(h9=%xmm1 +vpxor 288(%r8),%xmm6,%xmm1 + +# qhasm: h9 = h9 ^ mem128[ ptr + 80 ] +# asm 1: vpxor 80(h9=reg128#2 +# asm 2: vpxor 80(h9=%xmm1 +vpxor 80(%r8),%xmm1,%xmm1 + +# qhasm: mem128[ input_0 + 144 ] = h9 +# 
asm 1: movdqu h8=reg128#2 +# asm 2: vpxor 256(h8=%xmm1 +vpxor 256(%r8),%xmm7,%xmm1 + +# qhasm: h8 = h8 ^ mem128[ ptr + 48 ] +# asm 1: vpxor 48(h8=reg128#2 +# asm 2: vpxor 48(h8=%xmm1 +vpxor 48(%r8),%xmm1,%xmm1 + +# qhasm: mem128[ input_0 + 128 ] = h8 +# asm 1: movdqu h7=reg128#2 +# asm 2: vpxor 224(h7=%xmm1 +vpxor 224(%r8),%xmm8,%xmm1 + +# qhasm: h7 = h7 ^ mem128[ ptr + 16 ] +# asm 1: vpxor 16(h7=reg128#2 +# asm 2: vpxor 16(h7=%xmm1 +vpxor 16(%r8),%xmm1,%xmm1 + +# qhasm: mem128[ input_0 + 112 ] = h7 +# asm 1: movdqu h6=reg128#1 +# asm 2: vpxor 192(h6=%xmm0 +vpxor 192(%r8),%xmm0,%xmm0 + +# qhasm: mem128[ input_0 + 96 ] = h6 +# asm 1: movdqu h5=reg128#1 +# asm 2: vpxor 160(h5=%xmm0 +vpxor 160(%r8),%xmm3,%xmm0 + +# qhasm: mem128[ input_0 + 80 ] = h5 +# asm 1: movdqu h4=reg128#1 +# asm 2: vpxor 128(h4=%xmm0 +vpxor 128(%r8),%xmm4,%xmm0 + +# qhasm: mem128[ input_0 + 64 ] = h4 +# asm 1: movdqu h3=reg128#1 +# asm 2: vpxor 96(h3=%xmm0 +vpxor 96(%r8),%xmm13,%xmm0 + +# qhasm: mem128[ input_0 + 48 ] = h3 +# asm 1: movdqu h2=reg128#1 +# asm 2: vpxor 64(h2=%xmm0 +vpxor 64(%r8),%xmm9,%xmm0 + +# qhasm: mem128[ input_0 + 32 ] = h2 +# asm 1: movdqu h1=reg128#1 +# asm 2: vpxor 32(h1=%xmm0 +vpxor 32(%r8),%xmm10,%xmm0 + +# qhasm: mem128[ input_0 + 16 ] = h1 +# asm 1: movdqu h0=reg128#1 +# asm 2: vpxor 0(h0=%xmm0 +vpxor 0(%r8),%xmm11,%xmm0 + +# qhasm: mem128[ input_0 + 0 ] = h0 +# asm 1: movdqu + +typedef __m256i vec256; + +vec256 PQCLEAN_MCELIECE460896_AVX_vec256_set1_16b(uint16_t a); +vec256 PQCLEAN_MCELIECE460896_AVX_vec256_setzero(void); +vec256 PQCLEAN_MCELIECE460896_AVX_vec256_set4x(uint64_t a0, uint64_t a1, uint64_t a2, uint64_t a3); + +// Extract requires a literal argument so need to be macros +#define PQCLEAN_MCELIECE460896_AVX_vec256_extract2x(a,i) ((vec128) _mm256_extractf128_si256((vec256) (a),(i))) +#define PQCLEAN_MCELIECE460896_AVX_vec256_extract(a,i) ((uint64_t) _mm256_extract_epi64((vec256) (a),(i))) + +int PQCLEAN_MCELIECE460896_AVX_vec256_testz(vec256 a); +vec256 PQCLEAN_MCELIECE460896_AVX_vec256_and(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE460896_AVX_vec256_xor(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE460896_AVX_vec256_or(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE460896_AVX_vec256_sll_4x(vec256 a, int s); +vec256 PQCLEAN_MCELIECE460896_AVX_vec256_srl_4x(vec256 a, int s); +vec256 PQCLEAN_MCELIECE460896_AVX_vec256_unpack_low(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE460896_AVX_vec256_unpack_high(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE460896_AVX_vec256_unpack_low_2x(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE460896_AVX_vec256_unpack_high_2x(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE460896_AVX_vec256_or_reduce(const vec256 *a); +void PQCLEAN_MCELIECE460896_AVX_vec256_copy(vec256 *dest, const vec256 *src); + +/* bitsliced field multiplications */ +void PQCLEAN_MCELIECE460896_AVX_vec256_mul(vec256 *h, vec256 *f, const vec256 *g); +void PQCLEAN_MCELIECE460896_AVX_vec256_sq(vec256 * /*out*/, const vec256 * /*in*/); +void PQCLEAN_MCELIECE460896_AVX_vec256_inv(vec256 * /*out*/, const vec256 * /*in*/); + +extern void PQCLEAN_MCELIECE460896_AVX_vec256_maa_asm(vec256 *, vec256 *, const vec256 *); +extern void PQCLEAN_MCELIECE460896_AVX_vec256_ama_asm(vec256 *, vec256 *, const vec256 *); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece460896_avx/vec256_ama_asm.S b/src/kem/classic_mceliece/pqclean_mceliece460896_avx/vec256_ama_asm.S new file mode 100644 index 0000000000..6a4253faf7 --- /dev/null +++ 
b/src/kem/classic_mceliece/pqclean_mceliece460896_avx/vec256_ama_asm.S @@ -0,0 +1,2322 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: reg256 a0 + +# qhasm: reg256 a1 + +# qhasm: reg256 a2 + +# qhasm: reg256 a3 + +# qhasm: reg256 a4 + +# qhasm: reg256 a5 + +# qhasm: reg256 a6 + +# qhasm: reg256 a7 + +# qhasm: reg256 a8 + +# qhasm: reg256 a9 + +# qhasm: reg256 a10 + +# qhasm: reg256 a11 + +# qhasm: reg256 a12 + +# qhasm: reg256 b0 + +# qhasm: reg256 b1 + +# qhasm: reg256 r0 + +# qhasm: reg256 r1 + +# qhasm: reg256 r2 + +# qhasm: reg256 r3 + +# qhasm: reg256 r4 + +# qhasm: reg256 r5 + +# qhasm: reg256 r6 + +# qhasm: reg256 r7 + +# qhasm: reg256 r8 + +# qhasm: reg256 r9 + +# qhasm: reg256 r10 + +# qhasm: reg256 r11 + +# qhasm: reg256 r12 + +# qhasm: reg256 r13 + +# qhasm: reg256 r14 + +# qhasm: reg256 r15 + +# qhasm: reg256 r16 + +# qhasm: reg256 r17 + +# qhasm: reg256 r18 + +# qhasm: reg256 r19 + +# qhasm: reg256 r20 + +# qhasm: reg256 r21 + +# qhasm: reg256 r22 + +# qhasm: reg256 r23 + +# qhasm: reg256 r24 + +# qhasm: reg256 r + +# qhasm: enter vec256_ama_asm +.p2align 5 +.global _PQCLEAN_MCELIECE460896_AVX_vec256_ama_asm +.global PQCLEAN_MCELIECE460896_AVX_vec256_ama_asm +_PQCLEAN_MCELIECE460896_AVX_vec256_ama_asm: +PQCLEAN_MCELIECE460896_AVX_vec256_ama_asm: +mov %rsp,%r11 +and $31,%r11 +add $0,%r11 +sub %r11,%rsp + +# qhasm: b0 = mem256[ input_2 + 0 ] +# asm 1: vmovupd 0(b0=reg256#1 +# asm 2: vmovupd 0(b0=%ymm0 +vmovupd 0(%rdx),%ymm0 + +# qhasm: a12 = mem256[ input_0 + 384 ] +# asm 1: vmovupd 384(a12=reg256#2 +# asm 2: vmovupd 384(a12=%ymm1 +vmovupd 384(%rdi),%ymm1 + +# qhasm: a12 = a12 ^ mem256[ input_1 + 384 ] +# asm 1: vpxor 384(a12=reg256#2 +# asm 2: vpxor 384(a12=%ymm1 +vpxor 384(%rsi),%ymm1,%ymm1 + +# qhasm: mem256[ input_0 + 384 ] = a12 +# asm 1: vmovupd r12=reg256#3 +# asm 2: vpand r12=%ymm2 +vpand %ymm1,%ymm0,%ymm2 + +# qhasm: r13 = a12 & mem256[input_2 + 32] +# asm 1: vpand 32(r13=reg256#4 +# asm 2: vpand 32(r13=%ymm3 +vpand 32(%rdx),%ymm1,%ymm3 + +# qhasm: r14 = a12 & mem256[input_2 + 64] +# asm 1: vpand 64(r14=reg256#5 +# asm 2: vpand 64(r14=%ymm4 +vpand 64(%rdx),%ymm1,%ymm4 + +# qhasm: r15 = a12 & mem256[input_2 + 96] +# asm 1: vpand 96(r15=reg256#6 +# asm 2: vpand 96(r15=%ymm5 +vpand 96(%rdx),%ymm1,%ymm5 + +# qhasm: r16 = a12 & mem256[input_2 + 128] +# asm 1: vpand 128(r16=reg256#7 +# asm 2: vpand 128(r16=%ymm6 +vpand 128(%rdx),%ymm1,%ymm6 + +# qhasm: r17 = a12 & mem256[input_2 + 160] +# asm 1: vpand 160(r17=reg256#8 +# asm 2: vpand 160(r17=%ymm7 +vpand 160(%rdx),%ymm1,%ymm7 + +# qhasm: r18 = a12 & mem256[input_2 + 192] +# asm 1: vpand 192(r18=reg256#9 +# asm 2: vpand 192(r18=%ymm8 +vpand 192(%rdx),%ymm1,%ymm8 + +# qhasm: r19 = a12 & mem256[input_2 + 224] +# asm 1: vpand 224(r19=reg256#10 +# asm 2: vpand 224(r19=%ymm9 +vpand 224(%rdx),%ymm1,%ymm9 + +# qhasm: r20 = a12 & mem256[input_2 + 256] +# asm 1: vpand 256(r20=reg256#11 +# asm 2: vpand 256(r20=%ymm10 +vpand 256(%rdx),%ymm1,%ymm10 + +# qhasm: r21 = a12 & mem256[input_2 + 288] +# asm 1: vpand 288(r21=reg256#12 +# asm 2: vpand 288(r21=%ymm11 +vpand 288(%rdx),%ymm1,%ymm11 + +# qhasm: r22 = a12 & mem256[input_2 + 320] +# asm 1: vpand 
320(r22=reg256#13 +# asm 2: vpand 320(r22=%ymm12 +vpand 320(%rdx),%ymm1,%ymm12 + +# qhasm: r23 = a12 & mem256[input_2 + 352] +# asm 1: vpand 352(r23=reg256#14 +# asm 2: vpand 352(r23=%ymm13 +vpand 352(%rdx),%ymm1,%ymm13 + +# qhasm: r24 = a12 & mem256[input_2 + 384] +# asm 1: vpand 384(r24=reg256#2 +# asm 2: vpand 384(r24=%ymm1 +vpand 384(%rdx),%ymm1,%ymm1 + +# qhasm: r15 ^= r24 +# asm 1: vpxor r11=reg256#2 +# asm 2: vmovapd r11=%ymm1 +vmovapd %ymm1,%ymm1 + +# qhasm: a11 = mem256[ input_0 + 352 ] +# asm 1: vmovupd 352(a11=reg256#15 +# asm 2: vmovupd 352(a11=%ymm14 +vmovupd 352(%rdi),%ymm14 + +# qhasm: a11 = a11 ^ mem256[ input_1 + 352 ] +# asm 1: vpxor 352(a11=reg256#15 +# asm 2: vpxor 352(a11=%ymm14 +vpxor 352(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 352 ] = a11 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r21 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r22 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r23 ^= r +# asm 1: vpxor r10=reg256#14 +# asm 2: vmovapd r10=%ymm13 +vmovapd %ymm13,%ymm13 + +# qhasm: a10 = mem256[ input_0 + 320 ] +# asm 1: vmovupd 320(a10=reg256#15 +# asm 2: vmovupd 320(a10=%ymm14 +vmovupd 320(%rdi),%ymm14 + +# qhasm: a10 = a10 ^ mem256[ input_1 + 320 ] +# asm 1: vpxor 320(a10=reg256#15 +# asm 2: vpxor 320(a10=%ymm14 +vpxor 320(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 320 ] = a10 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: 
r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r21 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r22 ^= r +# asm 1: vpxor r9=reg256#13 +# asm 2: vmovapd r9=%ymm12 +vmovapd %ymm12,%ymm12 + +# qhasm: a9 = mem256[ input_0 + 288 ] +# asm 1: vmovupd 288(a9=reg256#15 +# asm 2: vmovupd 288(a9=%ymm14 +vmovupd 288(%rdi),%ymm14 + +# qhasm: a9 = a9 ^ mem256[ input_1 + 288 ] +# asm 1: vpxor 288(a9=reg256#15 +# asm 2: vpxor 288(a9=%ymm14 +vpxor 288(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 288 ] = a9 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r21 ^= r +# asm 1: vpxor r8=reg256#12 +# asm 2: vmovapd r8=%ymm11 +vmovapd %ymm11,%ymm11 + +# qhasm: a8 = mem256[ input_0 + 256 ] +# asm 1: vmovupd 256(a8=reg256#15 +# asm 2: vmovupd 256(a8=%ymm14 +vmovupd 256(%rdi),%ymm14 + +# qhasm: a8 = a8 ^ mem256[ input_1 + 256 ] +# asm 1: vpxor 256(a8=reg256#15 +# asm 2: vpxor 256(a8=%ymm14 +vpxor 256(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 256 ] = a8 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 
+vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r20 ^= r +# asm 1: vpxor r7=reg256#11 +# asm 2: vmovapd r7=%ymm10 +vmovapd %ymm10,%ymm10 + +# qhasm: a7 = mem256[ input_0 + 224 ] +# asm 1: vmovupd 224(a7=reg256#15 +# asm 2: vmovupd 224(a7=%ymm14 +vmovupd 224(%rdi),%ymm14 + +# qhasm: a7 = a7 ^ mem256[ input_1 + 224 ] +# asm 1: vpxor 224(a7=reg256#15 +# asm 2: vpxor 224(a7=%ymm14 +vpxor 224(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 224 ] = a7 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r19 ^= r +# asm 1: vpxor r6=reg256#10 +# asm 2: vmovapd r6=%ymm9 +vmovapd %ymm9,%ymm9 + +# qhasm: a6 = mem256[ input_0 + 192 ] +# asm 1: vmovupd 192(a6=reg256#15 +# asm 2: vmovupd 192(a6=%ymm14 +vmovupd 192(%rdi),%ymm14 + +# qhasm: a6 = a6 ^ mem256[ input_1 + 192 ] +# asm 1: vpxor 192(a6=reg256#15 +# asm 2: vpxor 192(a6=%ymm14 +vpxor 192(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 192 ] = a6 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor 
r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r18 ^= r +# asm 1: vpxor r5=reg256#9 +# asm 2: vmovapd r5=%ymm8 +vmovapd %ymm8,%ymm8 + +# qhasm: a5 = mem256[ input_0 + 160 ] +# asm 1: vmovupd 160(a5=reg256#15 +# asm 2: vmovupd 160(a5=%ymm14 +vmovupd 160(%rdi),%ymm14 + +# qhasm: a5 = a5 ^ mem256[ input_1 + 160 ] +# asm 1: vpxor 160(a5=reg256#15 +# asm 2: vpxor 160(a5=%ymm14 +vpxor 160(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 160 ] = a5 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r17 ^= r +# asm 1: vpxor r4=reg256#8 +# asm 2: vmovapd r4=%ymm7 +vmovapd %ymm7,%ymm7 + +# qhasm: a4 = mem256[ input_0 + 128 ] +# asm 1: vmovupd 128(a4=reg256#15 +# asm 2: vmovupd 128(a4=%ymm14 +vmovupd 128(%rdi),%ymm14 + +# qhasm: a4 = a4 ^ mem256[ input_1 + 128 ] +# asm 1: vpxor 128(a4=reg256#15 +# asm 2: vpxor 128(a4=%ymm14 +vpxor 128(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 128 ] = a4 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# 
qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r16 ^= r +# asm 1: vpxor r3=reg256#7 +# asm 2: vmovapd r3=%ymm6 +vmovapd %ymm6,%ymm6 + +# qhasm: a3 = mem256[ input_0 + 96 ] +# asm 1: vmovupd 96(a3=reg256#15 +# asm 2: vmovupd 96(a3=%ymm14 +vmovupd 96(%rdi),%ymm14 + +# qhasm: a3 = a3 ^ mem256[ input_1 + 96 ] +# asm 1: vpxor 96(a3=reg256#15 +# asm 2: vpxor 96(a3=%ymm14 +vpxor 96(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 96 ] = a3 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r15 ^= r +# asm 1: vpxor r2=reg256#6 +# asm 2: vmovapd r2=%ymm5 +vmovapd %ymm5,%ymm5 + +# qhasm: a2 = mem256[ input_0 + 64 ] +# asm 1: vmovupd 64(a2=reg256#15 +# asm 2: vmovupd 64(a2=%ymm14 +vmovupd 64(%rdi),%ymm14 + +# qhasm: a2 = a2 ^ mem256[ input_1 + 64 ] +# asm 1: vpxor 64(a2=reg256#15 +# asm 2: vpxor 64(a2=%ymm14 +vpxor 64(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 64 ] = a2 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 
+ +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r14 ^= r +# asm 1: vpxor r1=reg256#5 +# asm 2: vmovapd r1=%ymm4 +vmovapd %ymm4,%ymm4 + +# qhasm: a1 = mem256[ input_0 + 32 ] +# asm 1: vmovupd 32(a1=reg256#15 +# asm 2: vmovupd 32(a1=%ymm14 +vmovupd 32(%rdi),%ymm14 + +# qhasm: a1 = a1 ^ mem256[ input_1 + 32 ] +# asm 1: vpxor 32(a1=reg256#15 +# asm 2: vpxor 32(a1=%ymm14 +vpxor 32(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 32 ] = a1 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r1 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r13 ^= r +# asm 1: vpxor r0=reg256#4 +# asm 2: vmovapd r0=%ymm3 +vmovapd %ymm3,%ymm3 + +# qhasm: a0 = mem256[ input_0 + 0 ] +# asm 1: vmovupd 0(a0=reg256#15 +# asm 2: vmovupd 0(a0=%ymm14 +vmovupd 0(%rdi),%ymm14 + +# qhasm: a0 = a0 ^ mem256[ input_1 + 0 ] +# asm 1: vpxor 0(a0=reg256#15 +# asm 2: vpxor 0(a0=%ymm14 +vpxor 0(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 0 ] = a0 +# asm 1: vmovupd r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm14,%ymm0,%ymm0 + +# qhasm: r0 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 32(r=%ymm0 +vpand 32(%rdx),%ymm14,%ymm0 + +# qhasm: r1 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 64(r=%ymm0 +vpand 64(%rdx),%ymm14,%ymm0 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 96(r=%ymm0 +vpand 96(%rdx),%ymm14,%ymm0 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 128(r=%ymm0 +vpand 128(%rdx),%ymm14,%ymm0 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 160(r=%ymm0 +vpand 160(%rdx),%ymm14,%ymm0 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 192(r=%ymm0 +vpand 192(%rdx),%ymm14,%ymm0 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 224(r=%ymm0 +vpand 224(%rdx),%ymm14,%ymm0 + +# qhasm: r7 ^= r +# asm 1: 
vpxor r=reg256#1 +# asm 2: vpand 256(r=%ymm0 +vpand 256(%rdx),%ymm14,%ymm0 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 288(r=%ymm0 +vpand 288(%rdx),%ymm14,%ymm0 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 320(r=%ymm0 +vpand 320(%rdx),%ymm14,%ymm0 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 352(r=%ymm0 +vpand 352(%rdx),%ymm14,%ymm0 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 384(r=%ymm0 +vpand 384(%rdx),%ymm14,%ymm0 + +# qhasm: r12 ^= r +# asm 1: vpxor r12=reg256#1 +# asm 2: vpxor 384(r12=%ymm0 +vpxor 384(%rsi),%ymm2,%ymm0 + +# qhasm: mem256[ input_1 + 384 ] = r12 +# asm 1: vmovupd r11=reg256#1 +# asm 2: vpxor 352(r11=%ymm0 +vpxor 352(%rsi),%ymm1,%ymm0 + +# qhasm: mem256[ input_1 + 352 ] = r11 +# asm 1: vmovupd r10=reg256#1 +# asm 2: vpxor 320(r10=%ymm0 +vpxor 320(%rsi),%ymm13,%ymm0 + +# qhasm: mem256[ input_1 + 320 ] = r10 +# asm 1: vmovupd r9=reg256#1 +# asm 2: vpxor 288(r9=%ymm0 +vpxor 288(%rsi),%ymm12,%ymm0 + +# qhasm: mem256[ input_1 + 288 ] = r9 +# asm 1: vmovupd r8=reg256#1 +# asm 2: vpxor 256(r8=%ymm0 +vpxor 256(%rsi),%ymm11,%ymm0 + +# qhasm: mem256[ input_1 + 256 ] = r8 +# asm 1: vmovupd r7=reg256#1 +# asm 2: vpxor 224(r7=%ymm0 +vpxor 224(%rsi),%ymm10,%ymm0 + +# qhasm: mem256[ input_1 + 224 ] = r7 +# asm 1: vmovupd r6=reg256#1 +# asm 2: vpxor 192(r6=%ymm0 +vpxor 192(%rsi),%ymm9,%ymm0 + +# qhasm: mem256[ input_1 + 192 ] = r6 +# asm 1: vmovupd r5=reg256#1 +# asm 2: vpxor 160(r5=%ymm0 +vpxor 160(%rsi),%ymm8,%ymm0 + +# qhasm: mem256[ input_1 + 160 ] = r5 +# asm 1: vmovupd r4=reg256#1 +# asm 2: vpxor 128(r4=%ymm0 +vpxor 128(%rsi),%ymm7,%ymm0 + +# qhasm: mem256[ input_1 + 128 ] = r4 +# asm 1: vmovupd r3=reg256#1 +# asm 2: vpxor 96(r3=%ymm0 +vpxor 96(%rsi),%ymm6,%ymm0 + +# qhasm: mem256[ input_1 + 96 ] = r3 +# asm 1: vmovupd r2=reg256#1 +# asm 2: vpxor 64(r2=%ymm0 +vpxor 64(%rsi),%ymm5,%ymm0 + +# qhasm: mem256[ input_1 + 64 ] = r2 +# asm 1: vmovupd r1=reg256#1 +# asm 2: vpxor 32(r1=%ymm0 +vpxor 32(%rsi),%ymm4,%ymm0 + +# qhasm: mem256[ input_1 + 32 ] = r1 +# asm 1: vmovupd r0=reg256#1 +# asm 2: vpxor 0(r0=%ymm0 +vpxor 0(%rsi),%ymm3,%ymm0 + +# qhasm: mem256[ input_1 + 0 ] = r0 +# asm 1: vmovupd b0=reg256#1 +# asm 2: vmovupd 0(b0=%ymm0 +vmovupd 0(%rdx),%ymm0 + +# qhasm: a12 = mem256[ input_1 + 384 ] +# asm 1: vmovupd 384(a12=reg256#2 +# asm 2: vmovupd 384(a12=%ymm1 +vmovupd 384(%rsi),%ymm1 + +# qhasm: r12 = a12 & b0 +# asm 1: vpand r12=reg256#3 +# asm 2: vpand r12=%ymm2 +vpand %ymm1,%ymm0,%ymm2 + +# qhasm: r13 = a12 & mem256[input_2 + 32] +# asm 1: vpand 32(r13=reg256#4 +# asm 2: vpand 32(r13=%ymm3 +vpand 32(%rdx),%ymm1,%ymm3 + +# qhasm: r14 = a12 & mem256[input_2 + 64] +# asm 1: vpand 64(r14=reg256#5 +# asm 2: vpand 64(r14=%ymm4 +vpand 64(%rdx),%ymm1,%ymm4 + +# qhasm: r15 = a12 & mem256[input_2 + 96] +# asm 1: vpand 96(r15=reg256#6 +# asm 2: vpand 96(r15=%ymm5 +vpand 96(%rdx),%ymm1,%ymm5 + +# qhasm: r16 = a12 & mem256[input_2 + 128] +# asm 1: vpand 128(r16=reg256#7 +# asm 2: vpand 128(r16=%ymm6 +vpand 128(%rdx),%ymm1,%ymm6 + +# qhasm: r17 = a12 & mem256[input_2 + 160] +# asm 1: vpand 160(r17=reg256#8 +# asm 2: vpand 160(r17=%ymm7 +vpand 160(%rdx),%ymm1,%ymm7 + +# qhasm: r18 = a12 & mem256[input_2 + 192] +# asm 1: vpand 192(r18=reg256#9 +# asm 2: vpand 192(r18=%ymm8 +vpand 192(%rdx),%ymm1,%ymm8 + +# qhasm: r19 = a12 & mem256[input_2 + 224] +# asm 1: vpand 224(r19=reg256#10 +# asm 2: vpand 224(r19=%ymm9 +vpand 224(%rdx),%ymm1,%ymm9 + +# qhasm: r20 = a12 & mem256[input_2 + 256] +# asm 1: vpand 256(r20=reg256#11 +# 
asm 2: vpand 256(r20=%ymm10 +vpand 256(%rdx),%ymm1,%ymm10 + +# qhasm: r21 = a12 & mem256[input_2 + 288] +# asm 1: vpand 288(r21=reg256#12 +# asm 2: vpand 288(r21=%ymm11 +vpand 288(%rdx),%ymm1,%ymm11 + +# qhasm: r22 = a12 & mem256[input_2 + 320] +# asm 1: vpand 320(r22=reg256#13 +# asm 2: vpand 320(r22=%ymm12 +vpand 320(%rdx),%ymm1,%ymm12 + +# qhasm: r23 = a12 & mem256[input_2 + 352] +# asm 1: vpand 352(r23=reg256#14 +# asm 2: vpand 352(r23=%ymm13 +vpand 352(%rdx),%ymm1,%ymm13 + +# qhasm: r24 = a12 & mem256[input_2 + 384] +# asm 1: vpand 384(r24=reg256#2 +# asm 2: vpand 384(r24=%ymm1 +vpand 384(%rdx),%ymm1,%ymm1 + +# qhasm: r15 ^= r24 +# asm 1: vpxor r11=reg256#2 +# asm 2: vmovapd r11=%ymm1 +vmovapd %ymm1,%ymm1 + +# qhasm: a11 = mem256[ input_1 + 352 ] +# asm 1: vmovupd 352(a11=reg256#15 +# asm 2: vmovupd 352(a11=%ymm14 +vmovupd 352(%rsi),%ymm14 + +# qhasm: r = a11 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r21 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r22 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r23 ^= r +# asm 1: vpxor r10=reg256#14 +# asm 2: vmovapd r10=%ymm13 +vmovapd %ymm13,%ymm13 + +# qhasm: a10 = mem256[ input_1 + 320 ] +# asm 1: vmovupd 320(a10=reg256#15 +# asm 2: vmovupd 320(a10=%ymm14 +vmovupd 320(%rsi),%ymm14 + +# qhasm: r = a10 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 
+vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r21 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r22 ^= r +# asm 1: vpxor r9=reg256#13 +# asm 2: vmovapd r9=%ymm12 +vmovapd %ymm12,%ymm12 + +# qhasm: a9 = mem256[ input_1 + 288 ] +# asm 1: vmovupd 288(a9=reg256#15 +# asm 2: vmovupd 288(a9=%ymm14 +vmovupd 288(%rsi),%ymm14 + +# qhasm: r = a9 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r21 ^= r +# asm 1: vpxor r8=reg256#12 +# asm 2: vmovapd r8=%ymm11 +vmovapd %ymm11,%ymm11 + +# qhasm: a8 = mem256[ input_1 + 256 ] +# asm 1: vmovupd 256(a8=reg256#15 +# asm 2: vmovupd 256(a8=%ymm14 +vmovupd 256(%rsi),%ymm14 + +# qhasm: r = a8 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# 
asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r20 ^= r +# asm 1: vpxor r7=reg256#11 +# asm 2: vmovapd r7=%ymm10 +vmovapd %ymm10,%ymm10 + +# qhasm: a7 = mem256[ input_1 + 224 ] +# asm 1: vmovupd 224(a7=reg256#15 +# asm 2: vmovupd 224(a7=%ymm14 +vmovupd 224(%rsi),%ymm14 + +# qhasm: r = a7 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r19 ^= r +# asm 1: vpxor r6=reg256#10 +# asm 2: vmovapd r6=%ymm9 +vmovapd %ymm9,%ymm9 + +# qhasm: a6 = mem256[ input_1 + 192 ] +# asm 1: vmovupd 192(a6=reg256#15 +# asm 2: vmovupd 192(a6=%ymm14 +vmovupd 192(%rsi),%ymm14 + +# qhasm: r = a6 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r18 ^= r +# asm 1: vpxor r5=reg256#9 +# asm 2: vmovapd r5=%ymm8 +vmovapd %ymm8,%ymm8 + +# qhasm: a5 = mem256[ input_1 + 160 ] +# asm 1: vmovupd 160(a5=reg256#15 +# asm 2: vmovupd 
160(a5=%ymm14 +vmovupd 160(%rsi),%ymm14 + +# qhasm: r = a5 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r17 ^= r +# asm 1: vpxor r4=reg256#8 +# asm 2: vmovapd r4=%ymm7 +vmovapd %ymm7,%ymm7 + +# qhasm: a4 = mem256[ input_1 + 128 ] +# asm 1: vmovupd 128(a4=reg256#15 +# asm 2: vmovupd 128(a4=%ymm14 +vmovupd 128(%rsi),%ymm14 + +# qhasm: r = a4 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r16 ^= r +# asm 1: vpxor r3=reg256#7 +# asm 2: vmovapd r3=%ymm6 +vmovapd %ymm6,%ymm6 + +# qhasm: a3 = mem256[ input_1 + 96 ] +# asm 1: vmovupd 96(a3=reg256#15 +# asm 2: vmovupd 96(a3=%ymm14 +vmovupd 96(%rsi),%ymm14 + +# qhasm: r = a3 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor 
r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r15 ^= r +# asm 1: vpxor r2=reg256#6 +# asm 2: vmovapd r2=%ymm5 +vmovapd %ymm5,%ymm5 + +# qhasm: a2 = mem256[ input_1 + 64 ] +# asm 1: vmovupd 64(a2=reg256#15 +# asm 2: vmovupd 64(a2=%ymm14 +vmovupd 64(%rsi),%ymm14 + +# qhasm: r = a2 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r14 ^= r +# asm 1: vpxor r1=reg256#5 +# asm 2: vmovapd r1=%ymm4 +vmovapd %ymm4,%ymm4 + +# qhasm: a1 = mem256[ input_1 + 32 ] +# asm 1: vmovupd 32(a1=reg256#15 +# asm 2: vmovupd 32(a1=%ymm14 +vmovupd 32(%rsi),%ymm14 + +# qhasm: r = a1 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r1 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r 
+# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r13 ^= r +# asm 1: vpxor r0=reg256#4 +# asm 2: vmovapd r0=%ymm3 +vmovapd %ymm3,%ymm3 + +# qhasm: a0 = mem256[ input_1 + 0 ] +# asm 1: vmovupd 0(a0=reg256#15 +# asm 2: vmovupd 0(a0=%ymm14 +vmovupd 0(%rsi),%ymm14 + +# qhasm: r = a0 & b0 +# asm 1: vpand r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm14,%ymm0,%ymm0 + +# qhasm: r0 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 32(r=%ymm0 +vpand 32(%rdx),%ymm14,%ymm0 + +# qhasm: r1 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 64(r=%ymm0 +vpand 64(%rdx),%ymm14,%ymm0 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 96(r=%ymm0 +vpand 96(%rdx),%ymm14,%ymm0 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 128(r=%ymm0 +vpand 128(%rdx),%ymm14,%ymm0 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 160(r=%ymm0 +vpand 160(%rdx),%ymm14,%ymm0 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 192(r=%ymm0 +vpand 192(%rdx),%ymm14,%ymm0 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 224(r=%ymm0 +vpand 224(%rdx),%ymm14,%ymm0 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 256(r=%ymm0 +vpand 256(%rdx),%ymm14,%ymm0 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 288(r=%ymm0 +vpand 288(%rdx),%ymm14,%ymm0 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 320(r=%ymm0 +vpand 320(%rdx),%ymm14,%ymm0 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 352(r=%ymm0 +vpand 352(%rdx),%ymm14,%ymm0 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 384(r=%ymm0 +vpand 384(%rdx),%ymm14,%ymm0 + +# qhasm: r12 ^= r +# asm 1: vpxor r12=reg256#1 +# asm 2: vpxor 384(r12=%ymm0 +vpxor 384(%rdi),%ymm2,%ymm0 + +# qhasm: mem256[ input_0 + 384 ] = r12 +# asm 1: vmovupd r12=reg256#1 +# asm 2: vpxor 384(r12=%ymm0 +vpxor 384(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 384 ] = r12 +# asm 1: vmovupd r11=reg256#1 +# asm 2: vpxor 352(r11=%ymm0 +vpxor 352(%rdi),%ymm1,%ymm0 + +# qhasm: mem256[ input_0 + 352 ] = r11 +# asm 1: vmovupd r11=reg256#1 +# asm 2: vpxor 352(r11=%ymm0 +vpxor 352(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 352 ] = r11 +# asm 1: vmovupd r10=reg256#1 +# asm 2: vpxor 320(r10=%ymm0 +vpxor 320(%rdi),%ymm13,%ymm0 + +# qhasm: mem256[ input_0 + 320 ] = r10 +# asm 1: vmovupd r10=reg256#1 +# asm 2: vpxor 320(r10=%ymm0 +vpxor 320(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 320 ] = r10 +# asm 1: vmovupd r9=reg256#1 +# asm 2: vpxor 288(r9=%ymm0 +vpxor 288(%rdi),%ymm12,%ymm0 + +# qhasm: mem256[ input_0 + 288 ] = r9 +# asm 1: vmovupd r9=reg256#1 +# asm 2: vpxor 288(r9=%ymm0 +vpxor 288(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 288 ] = r9 +# asm 1: vmovupd r8=reg256#1 +# asm 2: vpxor 
256(r8=%ymm0 +vpxor 256(%rdi),%ymm11,%ymm0 + +# qhasm: mem256[ input_0 + 256 ] = r8 +# asm 1: vmovupd r8=reg256#1 +# asm 2: vpxor 256(r8=%ymm0 +vpxor 256(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 256 ] = r8 +# asm 1: vmovupd r7=reg256#1 +# asm 2: vpxor 224(r7=%ymm0 +vpxor 224(%rdi),%ymm10,%ymm0 + +# qhasm: mem256[ input_0 + 224 ] = r7 +# asm 1: vmovupd r7=reg256#1 +# asm 2: vpxor 224(r7=%ymm0 +vpxor 224(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 224 ] = r7 +# asm 1: vmovupd r6=reg256#1 +# asm 2: vpxor 192(r6=%ymm0 +vpxor 192(%rdi),%ymm9,%ymm0 + +# qhasm: mem256[ input_0 + 192 ] = r6 +# asm 1: vmovupd r6=reg256#1 +# asm 2: vpxor 192(r6=%ymm0 +vpxor 192(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 192 ] = r6 +# asm 1: vmovupd r5=reg256#1 +# asm 2: vpxor 160(r5=%ymm0 +vpxor 160(%rdi),%ymm8,%ymm0 + +# qhasm: mem256[ input_0 + 160 ] = r5 +# asm 1: vmovupd r5=reg256#1 +# asm 2: vpxor 160(r5=%ymm0 +vpxor 160(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 160 ] = r5 +# asm 1: vmovupd r4=reg256#1 +# asm 2: vpxor 128(r4=%ymm0 +vpxor 128(%rdi),%ymm7,%ymm0 + +# qhasm: mem256[ input_0 + 128 ] = r4 +# asm 1: vmovupd r4=reg256#1 +# asm 2: vpxor 128(r4=%ymm0 +vpxor 128(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 128 ] = r4 +# asm 1: vmovupd r3=reg256#1 +# asm 2: vpxor 96(r3=%ymm0 +vpxor 96(%rdi),%ymm6,%ymm0 + +# qhasm: mem256[ input_0 + 96 ] = r3 +# asm 1: vmovupd r3=reg256#1 +# asm 2: vpxor 96(r3=%ymm0 +vpxor 96(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 96 ] = r3 +# asm 1: vmovupd r2=reg256#1 +# asm 2: vpxor 64(r2=%ymm0 +vpxor 64(%rdi),%ymm5,%ymm0 + +# qhasm: mem256[ input_0 + 64 ] = r2 +# asm 1: vmovupd r2=reg256#1 +# asm 2: vpxor 64(r2=%ymm0 +vpxor 64(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 64 ] = r2 +# asm 1: vmovupd r1=reg256#1 +# asm 2: vpxor 32(r1=%ymm0 +vpxor 32(%rdi),%ymm4,%ymm0 + +# qhasm: mem256[ input_0 + 32 ] = r1 +# asm 1: vmovupd r1=reg256#1 +# asm 2: vpxor 32(r1=%ymm0 +vpxor 32(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 32 ] = r1 +# asm 1: vmovupd r0=reg256#1 +# asm 2: vpxor 0(r0=%ymm0 +vpxor 0(%rdi),%ymm3,%ymm0 + +# qhasm: mem256[ input_0 + 0 ] = r0 +# asm 1: vmovupd r0=reg256#1 +# asm 2: vpxor 0(r0=%ymm0 +vpxor 0(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 0 ] = r0 +# asm 1: vmovupd b0=reg256#1 +# asm 2: vmovupd 0(b0=%ymm0 +vmovupd 0(%rdx),%ymm0 + +# qhasm: a12 = mem256[ input_1 + 384 ] +# asm 1: vmovupd 384(a12=reg256#2 +# asm 2: vmovupd 384(a12=%ymm1 +vmovupd 384(%rsi),%ymm1 + +# qhasm: r12 = a12 & b0 +# asm 1: vpand r12=reg256#3 +# asm 2: vpand r12=%ymm2 +vpand %ymm1,%ymm0,%ymm2 + +# qhasm: r13 = a12 & mem256[input_2 + 32] +# asm 1: vpand 32(r13=reg256#4 +# asm 2: vpand 32(r13=%ymm3 +vpand 32(%rdx),%ymm1,%ymm3 + +# qhasm: r14 = a12 & mem256[input_2 + 64] +# asm 1: vpand 64(r14=reg256#5 +# asm 2: vpand 64(r14=%ymm4 +vpand 64(%rdx),%ymm1,%ymm4 + +# qhasm: r15 = a12 & mem256[input_2 + 96] +# asm 1: vpand 96(r15=reg256#6 +# asm 2: vpand 96(r15=%ymm5 +vpand 96(%rdx),%ymm1,%ymm5 + +# qhasm: r16 = a12 & mem256[input_2 + 128] +# asm 1: vpand 128(r16=reg256#7 +# asm 2: vpand 128(r16=%ymm6 +vpand 128(%rdx),%ymm1,%ymm6 + +# qhasm: r17 = a12 & mem256[input_2 + 160] +# asm 1: vpand 160(r17=reg256#8 +# asm 2: vpand 160(r17=%ymm7 +vpand 160(%rdx),%ymm1,%ymm7 + +# qhasm: r18 = a12 & mem256[input_2 + 192] +# asm 1: vpand 192(r18=reg256#9 +# asm 2: vpand 192(r18=%ymm8 +vpand 192(%rdx),%ymm1,%ymm8 + +# qhasm: r19 = a12 & mem256[input_2 + 224] +# asm 1: vpand 224(r19=reg256#10 +# asm 2: vpand 224(r19=%ymm9 +vpand 224(%rdx),%ymm1,%ymm9 + +# qhasm: r20 = a12 & 
mem256[input_2 + 256] +# asm 1: vpand 256(r20=reg256#11 +# asm 2: vpand 256(r20=%ymm10 +vpand 256(%rdx),%ymm1,%ymm10 + +# qhasm: r21 = a12 & mem256[input_2 + 288] +# asm 1: vpand 288(r21=reg256#12 +# asm 2: vpand 288(r21=%ymm11 +vpand 288(%rdx),%ymm1,%ymm11 + +# qhasm: r22 = a12 & mem256[input_2 + 320] +# asm 1: vpand 320(r22=reg256#13 +# asm 2: vpand 320(r22=%ymm12 +vpand 320(%rdx),%ymm1,%ymm12 + +# qhasm: r23 = a12 & mem256[input_2 + 352] +# asm 1: vpand 352(r23=reg256#14 +# asm 2: vpand 352(r23=%ymm13 +vpand 352(%rdx),%ymm1,%ymm13 + +# qhasm: r24 = a12 & mem256[input_2 + 384] +# asm 1: vpand 384(r24=reg256#2 +# asm 2: vpand 384(r24=%ymm1 +vpand 384(%rdx),%ymm1,%ymm1 + +# qhasm: r15 ^= r24 +# asm 1: vpxor r11=reg256#2 +# asm 2: vmovapd r11=%ymm1 +vmovapd %ymm1,%ymm1 + +# qhasm: a11 = mem256[ input_1 + 352 ] +# asm 1: vmovupd 352(a11=reg256#15 +# asm 2: vmovupd 352(a11=%ymm14 +vmovupd 352(%rsi),%ymm14 + +# qhasm: r = a11 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r21 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r22 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r23 ^= r +# asm 1: vpxor r10=reg256#14 +# asm 2: vmovapd r10=%ymm13 +vmovapd %ymm13,%ymm13 + +# qhasm: a10 = mem256[ input_1 + 320 ] +# asm 1: vmovupd 320(a10=reg256#15 +# asm 2: vmovupd 320(a10=%ymm14 +vmovupd 320(%rsi),%ymm14 + +# qhasm: r = a10 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r 
+# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r21 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r22 ^= r +# asm 1: vpxor r9=reg256#13 +# asm 2: vmovapd r9=%ymm12 +vmovapd %ymm12,%ymm12 + +# qhasm: a9 = mem256[ input_1 + 288 ] +# asm 1: vmovupd 288(a9=reg256#15 +# asm 2: vmovupd 288(a9=%ymm14 +vmovupd 288(%rsi),%ymm14 + +# qhasm: r = a9 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r21 ^= r +# asm 1: vpxor r8=reg256#12 +# asm 2: vmovapd r8=%ymm11 +vmovapd %ymm11,%ymm11 + +# qhasm: a8 = mem256[ input_1 + 256 ] +# asm 1: vmovupd 256(a8=reg256#15 +# asm 2: vmovupd 256(a8=%ymm14 +vmovupd 256(%rsi),%ymm14 + +# qhasm: r = a8 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 
+vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r20 ^= r +# asm 1: vpxor r7=reg256#11 +# asm 2: vmovapd r7=%ymm10 +vmovapd %ymm10,%ymm10 + +# qhasm: a7 = mem256[ input_1 + 224 ] +# asm 1: vmovupd 224(a7=reg256#15 +# asm 2: vmovupd 224(a7=%ymm14 +vmovupd 224(%rsi),%ymm14 + +# qhasm: r = a7 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r19 ^= r +# asm 1: vpxor r6=reg256#10 +# asm 2: vmovapd r6=%ymm9 +vmovapd %ymm9,%ymm9 + +# qhasm: a6 = mem256[ input_1 + 192 ] +# asm 1: vmovupd 192(a6=reg256#15 +# asm 2: vmovupd 192(a6=%ymm14 +vmovupd 192(%rsi),%ymm14 + +# qhasm: r = a6 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r18 ^= r +# asm 1: vpxor r5=reg256#9 +# asm 2: vmovapd r5=%ymm8 +vmovapd %ymm8,%ymm8 + +# qhasm: a5 = mem256[ input_1 + 160 ] +# 
asm 1: vmovupd 160(a5=reg256#15 +# asm 2: vmovupd 160(a5=%ymm14 +vmovupd 160(%rsi),%ymm14 + +# qhasm: r = a5 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r17 ^= r +# asm 1: vpxor r4=reg256#8 +# asm 2: vmovapd r4=%ymm7 +vmovapd %ymm7,%ymm7 + +# qhasm: a4 = mem256[ input_1 + 128 ] +# asm 1: vmovupd 128(a4=reg256#15 +# asm 2: vmovupd 128(a4=%ymm14 +vmovupd 128(%rsi),%ymm14 + +# qhasm: r = a4 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r16 ^= r +# asm 1: vpxor r3=reg256#7 +# asm 2: vmovapd r3=%ymm6 +vmovapd %ymm6,%ymm6 + +# qhasm: a3 = mem256[ input_1 + 96 ] +# asm 1: vmovupd 96(a3=reg256#15 +# asm 2: vmovupd 96(a3=%ymm14 +vmovupd 96(%rsi),%ymm14 + +# qhasm: r = a3 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 
32(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r15 ^= r +# asm 1: vpxor r2=reg256#6 +# asm 2: vmovapd r2=%ymm5 +vmovapd %ymm5,%ymm5 + +# qhasm: a2 = mem256[ input_1 + 64 ] +# asm 1: vmovupd 64(a2=reg256#15 +# asm 2: vmovupd 64(a2=%ymm14 +vmovupd 64(%rsi),%ymm14 + +# qhasm: r = a2 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r14 ^= r +# asm 1: vpxor r1=reg256#5 +# asm 2: vmovapd r1=%ymm4 +vmovapd %ymm4,%ymm4 + +# qhasm: a1 = mem256[ input_1 + 32 ] +# asm 1: vmovupd 32(a1=reg256#15 +# asm 2: vmovupd 32(a1=%ymm14 +vmovupd 32(%rsi),%ymm14 + +# qhasm: r = a1 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r1 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 
128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r13 ^= r +# asm 1: vpxor r0=reg256#4 +# asm 2: vmovapd r0=%ymm3 +vmovapd %ymm3,%ymm3 + +# qhasm: a0 = mem256[ input_1 + 0 ] +# asm 1: vmovupd 0(a0=reg256#15 +# asm 2: vmovupd 0(a0=%ymm14 +vmovupd 0(%rsi),%ymm14 + +# qhasm: r = a0 & b0 +# asm 1: vpand r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm14,%ymm0,%ymm0 + +# qhasm: r0 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 32(r=%ymm0 +vpand 32(%rdx),%ymm14,%ymm0 + +# qhasm: r1 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 64(r=%ymm0 +vpand 64(%rdx),%ymm14,%ymm0 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 96(r=%ymm0 +vpand 96(%rdx),%ymm14,%ymm0 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 128(r=%ymm0 +vpand 128(%rdx),%ymm14,%ymm0 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 160(r=%ymm0 +vpand 160(%rdx),%ymm14,%ymm0 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 192(r=%ymm0 +vpand 192(%rdx),%ymm14,%ymm0 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 224(r=%ymm0 +vpand 224(%rdx),%ymm14,%ymm0 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 256(r=%ymm0 +vpand 256(%rdx),%ymm14,%ymm0 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 288(r=%ymm0 +vpand 288(%rdx),%ymm14,%ymm0 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 320(r=%ymm0 +vpand 320(%rdx),%ymm14,%ymm0 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 352(r=%ymm0 +vpand 352(%rdx),%ymm14,%ymm0 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 384(r=%ymm0 +vpand 384(%rdx),%ymm14,%ymm0 + +# qhasm: r12 ^= r +# asm 1: vpxor r=int64#7 +# asm 2: mov $0,>r=%rax +mov $0,%rax + +# qhasm: t0 = mem64[ input_0 + 192 ] +# asm 1: movq 192(t0=int64#2 +# asm 2: movq 192(t0=%rsi +movq 192(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 200 ] +# asm 1: movq 200(t1=int64#3 +# asm 2: movq 200(t1=%rdx +movq 200(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 176(t0=%rsi +movq 176(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 184 ] +# asm 1: movq 184(t1=int64#3 +# asm 2: movq 184(t1=%rdx +movq 184(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 160(t0=%rsi +movq 160(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 168 ] +# asm 1: movq 168(t1=int64#3 +# asm 2: movq 168(t1=%rdx +movq 168(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 
1: and $1,t0=int64#2 +# asm 2: movq 144(t0=%rsi +movq 144(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 152 ] +# asm 1: movq 152(t1=int64#3 +# asm 2: movq 152(t1=%rdx +movq 152(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 128(t0=%rsi +movq 128(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 136 ] +# asm 1: movq 136(t1=int64#3 +# asm 2: movq 136(t1=%rdx +movq 136(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 112(t0=%rsi +movq 112(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 120 ] +# asm 1: movq 120(t1=int64#3 +# asm 2: movq 120(t1=%rdx +movq 120(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 96(t0=%rsi +movq 96(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 104 ] +# asm 1: movq 104(t1=int64#3 +# asm 2: movq 104(t1=%rdx +movq 104(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 80(t0=%rsi +movq 80(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 88 ] +# asm 1: movq 88(t1=int64#3 +# asm 2: movq 88(t1=%rdx +movq 88(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 64(t0=%rsi +movq 64(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 72 ] +# asm 1: movq 72(t1=int64#3 +# asm 2: movq 72(t1=%rdx +movq 72(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 48(t0=%rsi +movq 48(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 56 ] +# asm 1: movq 56(t1=int64#3 +# asm 2: movq 56(t1=%rdx +movq 56(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 32(t0=%rsi +movq 32(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 40 ] +# asm 1: movq 40(t1=int64#3 +# asm 2: movq 40(t1=%rdx +movq 40(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 16(t0=%rsi +movq 16(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 24 ] +# asm 1: movq 24(t1=int64#3 +# asm 2: movq 24(t1=%rdx +movq 24(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 0(t0=%rsi +movq 0(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(t1=int64#1 +# asm 2: movq 8(t1=%rdi +movq 8(%rdi),%rdi + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#1 +# asm 2: popcnt c=%rdi +popcnt %rsi, %rdi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1, +#include + +#include "aes.h" + + +void PQCLEAN_MCELIECE460896F_AVX_aes256ctr( + uint8_t *out, + size_t outlen, + const uint8_t nonce[AESCTR_NONCEBYTES], + const uint8_t key[AES256_KEYBYTES] +); + +#endif diff --git a/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/api.h b/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/api.h new file mode 100644 index 0000000000..674fa2779c --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/api.h @@ -0,0 +1,31 @@ 
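The api.h hunk that follows declares the scheme's public KEM entry points together with its size constants (524,160-byte public key, 13,568-byte secret key, 188-byte ciphertext, 32-byte shared secret). As a hedged illustration only, and not part of the patch, the sketch below shows how a caller might round-trip those three functions once the implementation is built; it assumes the usual PQClean convention that a randombytes() provider is supplied at link time, and the helper program itself (its name and the round-trip check) is illustrative rather than taken from this repository.

/* Hedged usage sketch (not part of the patch): round-trip the KEM API
 * declared in api.h below.  Assumes a randombytes() implementation is
 * linked in separately, per the usual PQClean convention. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "api.h"

int main(void) {
    /* public/secret keys are large for Classic McEliece, so heap-allocate */
    uint8_t *pk = malloc(PQCLEAN_MCELIECE460896F_AVX_CRYPTO_PUBLICKEYBYTES);
    uint8_t *sk = malloc(PQCLEAN_MCELIECE460896F_AVX_CRYPTO_SECRETKEYBYTES);
    uint8_t ct[PQCLEAN_MCELIECE460896F_AVX_CRYPTO_CIPHERTEXTBYTES];
    uint8_t ss_enc[PQCLEAN_MCELIECE460896F_AVX_CRYPTO_BYTES];
    uint8_t ss_dec[PQCLEAN_MCELIECE460896F_AVX_CRYPTO_BYTES];

    if (pk == NULL || sk == NULL) {
        free(pk);
        free(sk);
        return 1;
    }

    /* keypair, then encapsulate against pk, then decapsulate with sk */
    PQCLEAN_MCELIECE460896F_AVX_crypto_kem_keypair(pk, sk);
    PQCLEAN_MCELIECE460896F_AVX_crypto_kem_enc(ct, ss_enc, pk);
    PQCLEAN_MCELIECE460896F_AVX_crypto_kem_dec(ss_dec, ct, sk);

    /* both sides must derive the same 32-byte shared secret */
    printf("shared secrets %s\n",
           memcmp(ss_enc, ss_dec, sizeof ss_enc) == 0 ? "match" : "differ");

    free(pk);
    free(sk);
    return 0;
}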
+#ifndef PQCLEAN_MCELIECE460896F_AVX_API_H +#define PQCLEAN_MCELIECE460896F_AVX_API_H + +#include + +#define PQCLEAN_MCELIECE460896F_AVX_CRYPTO_ALGNAME "Classic McEliece 460896f" +#define PQCLEAN_MCELIECE460896F_AVX_CRYPTO_PUBLICKEYBYTES 524160 +#define PQCLEAN_MCELIECE460896F_AVX_CRYPTO_SECRETKEYBYTES 13568 +#define PQCLEAN_MCELIECE460896F_AVX_CRYPTO_CIPHERTEXTBYTES 188 +#define PQCLEAN_MCELIECE460896F_AVX_CRYPTO_BYTES 32 + +int PQCLEAN_MCELIECE460896F_AVX_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const uint8_t *pk +); + +int PQCLEAN_MCELIECE460896F_AVX_crypto_kem_dec( + uint8_t *key, + const uint8_t *c, + const uint8_t *sk +); + +int PQCLEAN_MCELIECE460896F_AVX_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/benes.c b/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/benes.c new file mode 100644 index 0000000000..defa2902f3 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/benes.c @@ -0,0 +1,311 @@ +/* + This file is for Benes network related functions +*/ + +#include "benes.h" + +#include "params.h" +#include "transpose.h" +#include "util.h" + +static void layer_x(vec128 *data, vec128 *bits) { + int i; + vec128 v0, v1; + vec128 d; + + for (i = 0; i < 64; i += 2) { + v0 = PQCLEAN_MCELIECE460896F_AVX_vec128_unpack_low(data[i + 0], data[i + 1]); + v1 = PQCLEAN_MCELIECE460896F_AVX_vec128_unpack_high(data[i + 0], data[i + 1]); + + d = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(v0, v1); + d = PQCLEAN_MCELIECE460896F_AVX_vec128_and(d, *bits++); + v0 = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(v0, d); + v1 = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(v1, d); + + data[i + 0] = PQCLEAN_MCELIECE460896F_AVX_vec128_unpack_low(v0, v1); + data[i + 1] = PQCLEAN_MCELIECE460896F_AVX_vec128_unpack_high(v0, v1); + } +} + +static void layer_0(vec128 *bs, vec128 *cond) { + int x; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 2) { + diff = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ x ], bs[ x + 1 ]); + diff = PQCLEAN_MCELIECE460896F_AVX_vec128_and(diff, *cond++); + bs[ x ] = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ x ], diff); + bs[ x + 1 ] = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ x + 1 ], diff); + } +} + +static void layer_1(vec128 *bs, vec128 *cond) { + int x; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 4) { + diff = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ x + 0 ], bs[ x + 2 ]); + diff = PQCLEAN_MCELIECE460896F_AVX_vec128_and(diff, cond[0]); + bs[ x + 0 ] = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ x + 0 ], diff); + bs[ x + 2 ] = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ x + 2 ], diff); + + diff = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ x + 1 ], bs[ x + 3 ]); + diff = PQCLEAN_MCELIECE460896F_AVX_vec128_and(diff, cond[1]); + bs[ x + 1 ] = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ x + 1 ], diff); + bs[ x + 3 ] = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ x + 3 ], diff); + + cond += 2; + } +} + +static void layer_2(vec128 *bs, vec128 *cond) { + int x; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 8) { + diff = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ x + 0 ], bs[ x + 4 ]); + diff = PQCLEAN_MCELIECE460896F_AVX_vec128_and(diff, cond[0]); + bs[ x + 0 ] = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ x + 0 ], diff); + bs[ x + 4 ] = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ x + 4 ], diff); + + diff = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ x + 1 ], bs[ x + 5 ]); + diff = PQCLEAN_MCELIECE460896F_AVX_vec128_and(diff, cond[1]); + bs[ x + 1 ] = 
PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ x + 1 ], diff); + bs[ x + 5 ] = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ x + 5 ], diff); + + diff = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ x + 2 ], bs[ x + 6 ]); + diff = PQCLEAN_MCELIECE460896F_AVX_vec128_and(diff, cond[2]); + bs[ x + 2 ] = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ x + 2 ], diff); + bs[ x + 6 ] = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ x + 6 ], diff); + + diff = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ x + 3 ], bs[ x + 7 ]); + diff = PQCLEAN_MCELIECE460896F_AVX_vec128_and(diff, cond[3]); + bs[ x + 3 ] = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ x + 3 ], diff); + bs[ x + 7 ] = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ x + 7 ], diff); + + cond += 4; + } +} + +static void layer_3(vec128 *bs, vec128 *cond) { + int x, s; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 16) { + for (s = x; s < x + 8; s += 4) { + diff = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ s + 0 ], bs[ s + 8 ]); + diff = PQCLEAN_MCELIECE460896F_AVX_vec128_and(diff, cond[0]); + bs[ s + 0 ] = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ s + 0 ], diff); + bs[ s + 8 ] = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ s + 8 ], diff); + + diff = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ s + 1 ], bs[ s + 9 ]); + diff = PQCLEAN_MCELIECE460896F_AVX_vec128_and(diff, cond[1]); + bs[ s + 1 ] = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ s + 1 ], diff); + bs[ s + 9 ] = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ s + 9 ], diff); + + diff = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ s + 2 ], bs[ s + 10 ]); + diff = PQCLEAN_MCELIECE460896F_AVX_vec128_and(diff, cond[2]); + bs[ s + 2 ] = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ s + 2 ], diff); + bs[ s + 10 ] = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ s + 10 ], diff); + + diff = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ s + 3 ], bs[ s + 11 ]); + diff = PQCLEAN_MCELIECE460896F_AVX_vec128_and(diff, cond[3]); + bs[ s + 3 ] = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ s + 3 ], diff); + bs[ s + 11 ] = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ s + 11 ], diff); + + cond += 4; + } + } +} + +static void layer_4(vec128 *bs, vec128 *cond) { + int x, s; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 32) { + for (s = x; s < x + 16; s += 4) { + diff = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ s + 0 ], bs[ s + 16 ]); + diff = PQCLEAN_MCELIECE460896F_AVX_vec128_and(diff, cond[0]); + bs[ s + 0 ] = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ s + 0 ], diff); + bs[ s + 16 ] = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ s + 16 ], diff); + + diff = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ s + 1 ], bs[ s + 17 ]); + diff = PQCLEAN_MCELIECE460896F_AVX_vec128_and(diff, cond[1]); + bs[ s + 1 ] = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ s + 1 ], diff); + bs[ s + 17 ] = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ s + 17 ], diff); + + diff = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ s + 2 ], bs[ s + 18 ]); + diff = PQCLEAN_MCELIECE460896F_AVX_vec128_and(diff, cond[2]); + bs[ s + 2 ] = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ s + 2 ], diff); + bs[ s + 18 ] = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ s + 18 ], diff); + + diff = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ s + 3 ], bs[ s + 19 ]); + diff = PQCLEAN_MCELIECE460896F_AVX_vec128_and(diff, cond[3]); + bs[ s + 3 ] = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ s + 3 ], diff); + bs[ s + 19 ] = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ s + 19 ], diff); + + cond += 4; + } + } +} + +static void layer_5(vec128 *bs, vec128 *cond) { + int x, s; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 
64) { + for (s = x; s < x + 32; s += 4) { + diff = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ s + 0 ], bs[ s + 32 ]); + diff = PQCLEAN_MCELIECE460896F_AVX_vec128_and(diff, cond[0]); + bs[ s + 0 ] = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ s + 0 ], diff); + bs[ s + 32 ] = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ s + 32 ], diff); + + diff = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ s + 1 ], bs[ s + 33 ]); + diff = PQCLEAN_MCELIECE460896F_AVX_vec128_and(diff, cond[1]); + bs[ s + 1 ] = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ s + 1 ], diff); + bs[ s + 33 ] = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ s + 33 ], diff); + + diff = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ s + 2 ], bs[ s + 34 ]); + diff = PQCLEAN_MCELIECE460896F_AVX_vec128_and(diff, cond[2]); + bs[ s + 2 ] = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ s + 2 ], diff); + bs[ s + 34 ] = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ s + 34 ], diff); + + diff = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ s + 3 ], bs[ s + 35 ]); + diff = PQCLEAN_MCELIECE460896F_AVX_vec128_and(diff, cond[3]); + bs[ s + 3 ] = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ s + 3 ], diff); + bs[ s + 35 ] = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(bs[ s + 35 ], diff); + + cond += 4; + } + } +} + +/* input: bits, control bits as array of bytes */ +/* output: bits_int, control bits as array of 128-bit vectors */ +void PQCLEAN_MCELIECE460896F_AVX_load_bits(vec128 bits_int[][32], const unsigned char *bits) { + int i, j; + const unsigned char *ptr = bits; + + vec128 buf[64]; + + for (i = 0; i <= 5; i += 2) { + for (j = 0; j < 64; j++) { + buf[j] = PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(PQCLEAN_MCELIECE460896F_AVX_load8(ptr), PQCLEAN_MCELIECE460896F_AVX_load8(ptr + 512)); + ptr += 8; + } + + PQCLEAN_MCELIECE460896F_AVX_transpose_64x128_sp( buf ); + + for (j = 0; j < 32; j++) { + bits_int[i + 0][j] = PQCLEAN_MCELIECE460896F_AVX_vec128_unpack_low(buf[j], buf[j + 32]); + bits_int[i + 1][j] = PQCLEAN_MCELIECE460896F_AVX_vec128_unpack_high(buf[j], buf[j + 32]); + } + + ptr += 512; + } + + for (i = 6; i <= 18; i++) { + for (j = 0; j < 32; j++) { + bits_int[i][j] = PQCLEAN_MCELIECE460896F_AVX_load16(ptr); + ptr += 16; + } + } + + for (i = 19; i < 25; i += 2) { + for (j = 0; j < 64; j++) { + buf[j] = PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(PQCLEAN_MCELIECE460896F_AVX_load8(ptr), PQCLEAN_MCELIECE460896F_AVX_load8(ptr + 512)); + ptr += 8; + } + + PQCLEAN_MCELIECE460896F_AVX_transpose_64x128_sp( buf ); + + for (j = 0; j < 32; j++) { + bits_int[i + 0][j] = PQCLEAN_MCELIECE460896F_AVX_vec128_unpack_low(buf[j], buf[j + 32]); + bits_int[i + 1][j] = PQCLEAN_MCELIECE460896F_AVX_vec128_unpack_high(buf[j], buf[j + 32]); + } + + ptr += 512; + } +} + +/* input: r, sequence of bits to be permuted */ +/* b, control bits as array of 128-bit vectors */ +/* rev, 0 for normal application; !0 for inverse */ +/* output: r, permuted bits */ +void PQCLEAN_MCELIECE460896F_AVX_benes(vec128 *r, vec128 b[][32], int rev) { + int inc; + + vec128 *b_ptr; + + if (rev == 0) { + inc = 32; + b_ptr = b[ 0]; + } else { + inc = -32; + b_ptr = b[24]; + } + + // + + PQCLEAN_MCELIECE460896F_AVX_transpose_64x128_sp( r ); + + layer_0(r, b_ptr); + b_ptr += inc; + layer_1(r, b_ptr); + b_ptr += inc; + layer_2(r, b_ptr); + b_ptr += inc; + layer_3(r, b_ptr); + b_ptr += inc; + layer_4(r, b_ptr); + b_ptr += inc; + layer_5(r, b_ptr); + b_ptr += inc; + + PQCLEAN_MCELIECE460896F_AVX_transpose_64x128_sp( r ); + + layer_x(r, b_ptr); + b_ptr += inc; + layer_0(r, b_ptr); + b_ptr += inc; + layer_1(r, b_ptr); + b_ptr += inc; + 
layer_2(r, b_ptr); + b_ptr += inc; + layer_3(r, b_ptr); + b_ptr += inc; + layer_4(r, b_ptr); + b_ptr += inc; + layer_5(r, b_ptr); + b_ptr += inc; + layer_4(r, b_ptr); + b_ptr += inc; + layer_3(r, b_ptr); + b_ptr += inc; + layer_2(r, b_ptr); + b_ptr += inc; + layer_1(r, b_ptr); + b_ptr += inc; + layer_0(r, b_ptr); + b_ptr += inc; + layer_x(r, b_ptr); + b_ptr += inc; + + PQCLEAN_MCELIECE460896F_AVX_transpose_64x128_sp( r ); + + layer_5(r, b_ptr); + b_ptr += inc; + layer_4(r, b_ptr); + b_ptr += inc; + layer_3(r, b_ptr); + b_ptr += inc; + layer_2(r, b_ptr); + b_ptr += inc; + layer_1(r, b_ptr); + b_ptr += inc; + layer_0(r, b_ptr); + //b_ptr += inc; + + PQCLEAN_MCELIECE460896F_AVX_transpose_64x128_sp( r ); +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/benes.h b/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/benes.h new file mode 100644 index 0000000000..5032513ae6 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/benes.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE460896F_AVX_BENES_H +#define PQCLEAN_MCELIECE460896F_AVX_BENES_H +/* + This file is for Benes network related functions +*/ + + +#include "vec128.h" + +void PQCLEAN_MCELIECE460896F_AVX_load_bits(vec128 /*bits_int*/[][32], const unsigned char * /*bits*/); +void PQCLEAN_MCELIECE460896F_AVX_benes(vec128 * /*r*/, vec128 /*b*/[][32], int /*rev*/); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/bm.c b/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/bm.c new file mode 100644 index 0000000000..fa18f8f0b1 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/bm.c @@ -0,0 +1,215 @@ +/* + This file is for the inversion-free Berlekamp-Massey algorithm + see https://ieeexplore.ieee.org/document/87857 +*/ + +#include "bm.h" + +#include "gf.h" +#include "params.h" +#include "vec128.h" + +#include + +extern gf PQCLEAN_MCELIECE460896F_AVX_vec_reduce_asm(vec128 *); +extern void PQCLEAN_MCELIECE460896F_AVX_update_asm(void *, gf, int); + +static inline uint16_t mask_nonzero(gf a) { + uint32_t ret = a; + + ret -= 1; + ret >>= 31; + ret -= 1; + + return ret; +} + +static inline uint16_t mask_leq(uint16_t a, uint16_t b) { + uint32_t a_tmp = a; + uint32_t b_tmp = b; + uint32_t ret = b_tmp - a_tmp; + + ret >>= 31; + ret -= 1; + + return ret; +} + +static inline void vec128_cmov(vec128 out[][2], uint16_t mask) { + int i; + + vec128 v0, v1; + + vec128 m0 = PQCLEAN_MCELIECE460896F_AVX_vec128_set1_16b( mask); + vec128 m1 = PQCLEAN_MCELIECE460896F_AVX_vec128_set1_16b(~mask); + + for (i = 0; i < GFBITS; i++) { + v0 = PQCLEAN_MCELIECE460896F_AVX_vec128_and(out[i][1], m0); + v1 = PQCLEAN_MCELIECE460896F_AVX_vec128_and(out[i][0], m1); + out[i][0] = PQCLEAN_MCELIECE460896F_AVX_vec128_or(v0, v1); + } +} + +static inline void interleave(vec256 *in, int idx0, int idx1, vec256 *mask, int b) { + int s = 1 << b; + + vec256 x, y; + + x = PQCLEAN_MCELIECE460896F_AVX_vec256_or(PQCLEAN_MCELIECE460896F_AVX_vec256_and(in[idx0], mask[0]), + PQCLEAN_MCELIECE460896F_AVX_vec256_sll_4x(PQCLEAN_MCELIECE460896F_AVX_vec256_and(in[idx1], mask[0]), s)); + + y = PQCLEAN_MCELIECE460896F_AVX_vec256_or(PQCLEAN_MCELIECE460896F_AVX_vec256_srl_4x(PQCLEAN_MCELIECE460896F_AVX_vec256_and(in[idx0], mask[1]), s), + PQCLEAN_MCELIECE460896F_AVX_vec256_and(in[idx1], mask[1])); + + in[idx0] = x; + in[idx1] = y; +} + +/* input: in, field elements in bitsliced form */ +/* output: out, field elements in non-bitsliced form */ +static inline void get_coefs(gf *out, vec256 *in) { + int i, k; + + vec256 
mask[4][2]; + vec256 buf[16]; + + for (i = 0; i < 13; i++) { + buf[i] = in[i]; + } + for (i = 13; i < 16; i++) { + buf[i] = PQCLEAN_MCELIECE460896F_AVX_vec256_setzero(); + } + + mask[0][0] = PQCLEAN_MCELIECE460896F_AVX_vec256_set1_16b(0x5555); + mask[0][1] = PQCLEAN_MCELIECE460896F_AVX_vec256_set1_16b(0xAAAA); + mask[1][0] = PQCLEAN_MCELIECE460896F_AVX_vec256_set1_16b(0x3333); + mask[1][1] = PQCLEAN_MCELIECE460896F_AVX_vec256_set1_16b(0xCCCC); + mask[2][0] = PQCLEAN_MCELIECE460896F_AVX_vec256_set1_16b(0x0F0F); + mask[2][1] = PQCLEAN_MCELIECE460896F_AVX_vec256_set1_16b(0xF0F0); + mask[3][0] = PQCLEAN_MCELIECE460896F_AVX_vec256_set1_16b(0x00FF); + mask[3][1] = PQCLEAN_MCELIECE460896F_AVX_vec256_set1_16b(0xFF00); + + interleave(buf, 0, 8, mask[3], 3); + interleave(buf, 1, 9, mask[3], 3); + interleave(buf, 2, 10, mask[3], 3); + interleave(buf, 3, 11, mask[3], 3); + interleave(buf, 4, 12, mask[3], 3); + interleave(buf, 5, 13, mask[3], 3); + interleave(buf, 6, 14, mask[3], 3); + interleave(buf, 7, 15, mask[3], 3); + + interleave(buf, 0, 4, mask[2], 2); + interleave(buf, 1, 5, mask[2], 2); + interleave(buf, 2, 6, mask[2], 2); + interleave(buf, 3, 7, mask[2], 2); + interleave(buf, 8, 12, mask[2], 2); + interleave(buf, 9, 13, mask[2], 2); + interleave(buf, 10, 14, mask[2], 2); + interleave(buf, 11, 15, mask[2], 2); + + interleave(buf, 0, 2, mask[1], 1); + interleave(buf, 1, 3, mask[1], 1); + interleave(buf, 4, 6, mask[1], 1); + interleave(buf, 5, 7, mask[1], 1); + interleave(buf, 8, 10, mask[1], 1); + interleave(buf, 9, 11, mask[1], 1); + interleave(buf, 12, 14, mask[1], 1); + interleave(buf, 13, 15, mask[1], 1); + + interleave(buf, 0, 1, mask[0], 0); + interleave(buf, 2, 3, mask[0], 0); + interleave(buf, 4, 5, mask[0], 0); + interleave(buf, 6, 7, mask[0], 0); + interleave(buf, 8, 9, mask[0], 0); + interleave(buf, 10, 11, mask[0], 0); + interleave(buf, 12, 13, mask[0], 0); + interleave(buf, 14, 15, mask[0], 0); + + for (i = 0; i < 16; i++) { + for (k = 0; k < 4; k++) { + out[ (4 * 0 + k) * 16 + i ] = (PQCLEAN_MCELIECE460896F_AVX_vec256_extract(buf[i], 0) >> (k * 16)) & GFMASK; + out[ (4 * 1 + k) * 16 + i ] = (PQCLEAN_MCELIECE460896F_AVX_vec256_extract(buf[i], 1) >> (k * 16)) & GFMASK; + out[ (4 * 2 + k) * 16 + i ] = (PQCLEAN_MCELIECE460896F_AVX_vec256_extract(buf[i], 2) >> (k * 16)) & GFMASK; + out[ (4 * 3 + k) * 16 + i ] = (PQCLEAN_MCELIECE460896F_AVX_vec256_extract(buf[i], 3) >> (k * 16)) & GFMASK; + } + } +} + +typedef union { + vec128 as_128[GFBITS][2]; + vec256 as_256[GFBITS]; +} aligned_double_vec128; + +/* input: in, sequence of field elements */ +/* output: out, minimal polynomial of in */ +void PQCLEAN_MCELIECE460896F_AVX_bm(vec128 *out, vec256 *in) { + int i; + uint16_t N, L; + uint16_t mask; + uint64_t one = 1; + uint64_t v[2]; + + vec128 prod[ GFBITS ]; + vec128 interval[GFBITS]; + + aligned_double_vec128 db; + aligned_double_vec128 BC_tmp; + aligned_double_vec128 BC; + + gf d, b; + gf coefs[256]; + + // initialization + + get_coefs(coefs, in); + + BC.as_128[0][0] = PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0, one << 62); + BC.as_128[0][1] = PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0, one << 63); + + for (i = 1; i < GFBITS; i++) { + BC.as_128[i][0] = BC.as_128[i][1] = PQCLEAN_MCELIECE460896F_AVX_vec128_setzero(); + } + + b = 1; + L = 0; + + // + + for (i = 0; i < GFBITS; i++) { + interval[i] = PQCLEAN_MCELIECE460896F_AVX_vec128_setzero(); + } + + for (N = 0; N < SYS_T * 2; N++) { + PQCLEAN_MCELIECE460896F_AVX_update_asm(interval, coefs[N], 16); + 
PQCLEAN_MCELIECE460896F_AVX_vec128_mul_asm(prod, interval, BC.as_128[0] + 1, 32); + + d = PQCLEAN_MCELIECE460896F_AVX_vec_reduce_asm(prod); + + mask = mask_nonzero(d) & mask_leq(L * 2, N); + + for (i = 0; i < GFBITS; i++) { + db.as_128[i][0] = PQCLEAN_MCELIECE460896F_AVX_vec128_setbits((d >> i) & 1); + db.as_128[i][1] = PQCLEAN_MCELIECE460896F_AVX_vec128_setbits((b >> i) & 1); + } + + PQCLEAN_MCELIECE460896F_AVX_vec256_mul(BC_tmp.as_256, db.as_256, BC.as_256); + + vec128_cmov(BC.as_128, mask); + PQCLEAN_MCELIECE460896F_AVX_update_asm(BC.as_128, 0, 32); + + for (i = 0; i < GFBITS; i++) { + BC.as_128[i][1] = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(BC_tmp.as_128[i][0], BC_tmp.as_128[i][1]); + } + + b = (d & mask) | (b & ~mask); + L = ((N + 1 - L) & mask) | (L & ~mask); + } + + for (i = 0; i < GFBITS; i++) { + v[0] = PQCLEAN_MCELIECE460896F_AVX_vec128_extract(BC.as_128[i][1], 0); + v[1] = PQCLEAN_MCELIECE460896F_AVX_vec128_extract(BC.as_128[i][1], 1); + + out[i] = PQCLEAN_MCELIECE460896F_AVX_vec128_set2x((v[0] >> 31) | (v[1] << 33), v[1] >> 31); + } +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/bm.h b/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/bm.h new file mode 100644 index 0000000000..b4cf2396de --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/bm.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE460896F_AVX_BM_H +#define PQCLEAN_MCELIECE460896F_AVX_BM_H +/* + This file is for the inversion-free Berlekamp-Massey algorithm + see https://ieeexplore.ieee.org/document/87857 +*/ + +#include "vec128.h" +#include "vec256.h" + +void PQCLEAN_MCELIECE460896F_AVX_bm(vec128 * /*out*/, vec256 * /*in*/); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/consts.S b/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/consts.S new file mode 100644 index 0000000000..be96319337 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/consts.S @@ -0,0 +1,33 @@ +.data + +# not supported on MacOS +#.section .rodata + +.globl PQCLEAN_MCELIECE460896F_AVX_MASK0_0 +.globl PQCLEAN_MCELIECE460896F_AVX_MASK0_1 +.globl PQCLEAN_MCELIECE460896F_AVX_MASK1_0 +.globl PQCLEAN_MCELIECE460896F_AVX_MASK1_1 +.globl PQCLEAN_MCELIECE460896F_AVX_MASK2_0 +.globl PQCLEAN_MCELIECE460896F_AVX_MASK2_1 +.globl PQCLEAN_MCELIECE460896F_AVX_MASK3_0 +.globl PQCLEAN_MCELIECE460896F_AVX_MASK3_1 +.globl PQCLEAN_MCELIECE460896F_AVX_MASK4_0 +.globl PQCLEAN_MCELIECE460896F_AVX_MASK4_1 +.globl PQCLEAN_MCELIECE460896F_AVX_MASK5_0 +.globl PQCLEAN_MCELIECE460896F_AVX_MASK5_1 + +.p2align 5 + +PQCLEAN_MCELIECE460896F_AVX_MASK0_0: .quad 0x5555555555555555, 0x5555555555555555, 0x5555555555555555, 0x5555555555555555 +PQCLEAN_MCELIECE460896F_AVX_MASK0_1: .quad 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA +PQCLEAN_MCELIECE460896F_AVX_MASK1_0: .quad 0x3333333333333333, 0x3333333333333333, 0x3333333333333333, 0x3333333333333333 +PQCLEAN_MCELIECE460896F_AVX_MASK1_1: .quad 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC +PQCLEAN_MCELIECE460896F_AVX_MASK2_0: .quad 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F +PQCLEAN_MCELIECE460896F_AVX_MASK2_1: .quad 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0 +PQCLEAN_MCELIECE460896F_AVX_MASK3_0: .quad 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF +PQCLEAN_MCELIECE460896F_AVX_MASK3_1: .quad 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00 
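The MASKn_0/MASKn_1 pairs defined in consts.S are complementary bit-stripe masks (1-, 2-, 4-, 8-, 16- and 32-bit groups); the vectorized transposes and the interleave() step in bm.c combine such masks with shifts to exchange bit groups between positions. As a hedged, scalar-only illustration (not part of the patch), the same mask-and-shift exchange on a single 64-bit word transposes an 8x8 bit matrix; delta_swap and transpose_8x8 below are illustrative names, not functions from this implementation.

/* Hedged scalar sketch (not part of the patch): the mask-and-shift bit
 * exchange that the AVX code performs lane-wise with the MASKn_0/MASKn_1
 * constants, shown here on a single 64-bit word. */
#include <stdint.h>

/* Swap the bit at position i with the bit at position i+shift, for every
 * position i selected by mask. */
static uint64_t delta_swap(uint64_t x, uint64_t mask, unsigned shift) {
    uint64_t t = ((x >> shift) ^ x) & mask;
    return x ^ t ^ (t << shift);
}

/* Transpose an 8x8 bit matrix stored row-major in one word: three rounds
 * with successively coarser stripe masks, a scalar analogue of the larger
 * bit-matrix transposes used in this implementation. */
static uint64_t transpose_8x8(uint64_t x) {
    x = delta_swap(x, 0x00AA00AA00AA00AAULL, 7);
    x = delta_swap(x, 0x0000CCCC0000CCCCULL, 14);
    x = delta_swap(x, 0x00000000F0F0F0F0ULL, 28);
    return x;
}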
+PQCLEAN_MCELIECE460896F_AVX_MASK4_0: .quad 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF +PQCLEAN_MCELIECE460896F_AVX_MASK4_1: .quad 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000 +PQCLEAN_MCELIECE460896F_AVX_MASK5_0: .quad 0x00000000FFFFFFFF, 0x00000000FFFFFFFF, 0x00000000FFFFFFFF, 0x00000000FFFFFFFF +PQCLEAN_MCELIECE460896F_AVX_MASK5_1: .quad 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000 + diff --git a/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/consts.inc b/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/consts.inc new file mode 100644 index 0000000000..9d1846bb1b --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/consts.inc @@ -0,0 +1,502 @@ +//64 +{ + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X6969969669699696, 0X6969969669699696, 0X6969969669699696, 0X6969969669699696), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X9966669966999966, 0X9966669966999966, 0X9966669966999966, 0X9966669966999966), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X9966669966999966, 0X9966669966999966, 0X9966669966999966, 0X9966669966999966), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0XFF0000FF00FFFF00, 0XFF0000FF00FFFF00, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XCC3333CCCC3333CC, 0XCC3333CCCC3333CC, 0XCC3333CCCC3333CC, 0XCC3333CCCC3333CC), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X9966669966999966, 0X9966669966999966, 0X9966669966999966, 0X9966669966999966), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X6666666666666666, 0X6666666666666666, 0X6666666666666666, 0X6666666666666666), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XA55AA55AA55AA55A, 0XA55AA55AA55AA55A, 0XA55AA55AA55AA55A, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XCCCC33333333CCCC, 0XCCCC33333333CCCC, 0XCCCC33333333CCCC, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0FF0F00FF00F0FF0, 0X0FF0F00FF00F0FF0, 0X0FF0F00FF00F0FF0, 0X0FF0F00FF00F0FF0), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A) +}, +//128 +{ + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X9999999966666666, 0X6666666699999999, 0X9999999966666666, 0X6666666699999999), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3, 0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XCC33CC3333CC33CC, 0X33CC33CCCC33CC33, 0XCC33CC3333CC33CC, 0X33CC33CCCC33CC33), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + 
PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X3C3C3C3C3C3C3C3C, 0X3C3C3C3C3C3C3C3C, 0X3C3C3C3C3C3C3C3C, 0X3C3C3C3C3C3C3C3C), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XAA5555AAAA5555AA, 0XAA5555AAAA5555AA, 0XAA5555AAAA5555AA, 0XAA5555AAAA5555AA), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XC33C3CC33CC3C33C, 0XC33C3CC33CC3C33C, 0XC33C3CC33CC3C33C, 0XC33C3CC33CC3C33C), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X00FFFF0000FFFF00, 0XFF0000FFFF0000FF, 0X00FFFF0000FFFF00, 0XFF0000FFFF0000FF) +}, +//256 +{ + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0FF0F00FF00F0FF0, 0X0FF0F00FF00F0FF0, 0XF00F0FF00FF0F00F, 0XF00F0FF00FF0F00F), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X9669966969966996, 0X6996699696699669, 0X6996699696699669, 0X9669966969966996), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000FFFFFFFF0000, 0XFFFF00000000FFFF, 0X0000FFFFFFFF0000, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X33333333CCCCCCCC, 0X33333333CCCCCCCC, 0X33333333CCCCCCCC, 0X33333333CCCCCCCC), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0X5AA5A55AA55A5AA5, 0X5AA5A55AA55A5AA5, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X00FFFF0000FFFF00, 0XFF0000FFFF0000FF, 0XFF0000FFFF0000FF, 0X00FFFF0000FFFF00), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XC33CC33CC33CC33C, 0XC33CC33CC33CC33C, 0XC33CC33CC33CC33C, 0XC33CC33CC33CC33C), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555) +}, +//512 +{ + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XC33C3CC33CC3C33C, 0X3CC3C33CC33C3CC3, 0XC33C3CC33CC3C33C, 0X3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X9966669966999966, 0X6699996699666699, 0X9966669966999966, 0X6699996699666699), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X9966996699669966, 0X6699669966996699, 0X6699669966996699, 0X9966996699669966), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X6969969669699696, 0X6969969669699696, 0X6969969669699696, 0X6969969669699696), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XAA55AA5555AA55AA, 0XAA55AA5555AA55AA, 0XAA55AA5555AA55AA, 0XAA55AA5555AA55AA), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X9966996699669966, 0X9966996699669966, 0X6699669966996699, 0X6699669966996699), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X5AA5A55A5AA5A55A, 0XA55A5AA5A55A5AA5, 0X5AA5A55A5AA5A55A, 0XA55A5AA5A55A5AA5), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XC3C3C3C33C3C3C3C, 0XC3C3C3C33C3C3C3C, 0X3C3C3C3CC3C3C3C3, 0X3C3C3C3CC3C3C3C3), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X3333CCCC3333CCCC, 0X3333CCCC3333CCCC, 0XCCCC3333CCCC3333, 0XCCCC3333CCCC3333), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X9999999966666666, 0X6666666699999999, 0X6666666699999999, 0X9999999966666666), + 
PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XC33CC33CC33CC33C, 0X3CC33CC33CC33CC3, 0XC33CC33CC33CC33C, 0X3CC33CC33CC33CC3), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X6666999999996666, 0X9999666666669999, 0X9999666666669999, 0X6666999999996666) +}, +{ + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XC33C3CC33CC3C33C, 0X3CC3C33CC33C3CC3, 0XC33C3CC33CC3C33C, 0X3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X6699996699666699, 0X9966669966999966, 0X6699996699666699, 0X9966669966999966), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X6699669966996699, 0X9966996699669966, 0X9966996699669966, 0X6699669966996699), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X6969969669699696, 0X6969969669699696, 0X6969969669699696, 0X6969969669699696), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X55AA55AAAA55AA55, 0X55AA55AAAA55AA55, 0X55AA55AAAA55AA55, 0X55AA55AAAA55AA55), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X9966996699669966, 0X9966996699669966, 0X6699669966996699, 0X6699669966996699), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X5AA5A55A5AA5A55A, 0XA55A5AA5A55A5AA5, 0X5AA5A55A5AA5A55A, 0XA55A5AA5A55A5AA5), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XC3C3C3C33C3C3C3C, 0XC3C3C3C33C3C3C3C, 0X3C3C3C3CC3C3C3C3, 0X3C3C3C3CC3C3C3C3), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X3333CCCC3333CCCC, 0X3333CCCC3333CCCC, 0XCCCC3333CCCC3333, 0XCCCC3333CCCC3333), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X9999999966666666, 0X6666666699999999, 0X6666666699999999, 0X9999999966666666), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XC33CC33CC33CC33C, 0X3CC33CC33CC33CC3, 0XC33CC33CC33CC33C, 0X3CC33CC33CC33CC3), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X6666999999996666, 0X9999666666669999, 0X9999666666669999, 0X6666999999996666) +}, +//1024 +{ + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3, 0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA, 0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA, 0XAA5555AA55AAAA55, 0XAA5555AA55AAAA55), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XF00F0FF0F00F0FF0, 0XF00F0FF0F00F0FF0, 0X0FF0F00F0FF0F00F, 0X0FF0F00F0FF0F00F), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X9669699696696996, 0X9669699696696996, 0X9669699696696996, 0X9669699696696996), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XA55AA55AA55AA55A, 0X5AA55AA55AA55AA5, 0X5AA55AA55AA55AA5, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA, 0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XCCCC33333333CCCC, 0X3333CCCCCCCC3333, 0X3333CCCCCCCC3333, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000, 0XFFFF00000000FFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X6996699669966996, 0X9669966996699669, 0X9669966996699669, 0X6996699669966996) +}, +{ + 
PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3, 0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA, 0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XAA5555AA55AAAA55, 0XAA5555AA55AAAA55, 0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0FF0F00F0FF0F00F, 0X0FF0F00F0FF0F00F, 0XF00F0FF0F00F0FF0, 0XF00F0FF0F00F0FF0), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X6996966969969669, 0X6996966969969669, 0X6996966969969669, 0X6996966969969669), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XA55AA55AA55AA55A, 0X5AA55AA55AA55AA5, 0X5AA55AA55AA55AA5, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555, 0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XCCCC33333333CCCC, 0X3333CCCCCCCC3333, 0X3333CCCCCCCC3333, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000, 0XFFFF00000000FFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X6996699669966996, 0X9669966996699669, 0X9669966996699669, 0X6996699669966996) +}, +{ + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3, 0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555, 0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XAA5555AA55AAAA55, 0XAA5555AA55AAAA55, 0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XF00F0FF0F00F0FF0, 0XF00F0FF0F00F0FF0, 0X0FF0F00F0FF0F00F, 0X0FF0F00F0FF0F00F), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X9669699696696996, 0X9669699696696996, 0X9669699696696996, 0X9669699696696996), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XA55AA55AA55AA55A, 0X5AA55AA55AA55AA5, 0X5AA55AA55AA55AA5, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA, 0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XCCCC33333333CCCC, 0X3333CCCCCCCC3333, 0X3333CCCCCCCC3333, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000, 0XFFFF00000000FFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X6996699669966996, 0X9669966996699669, 0X9669966996699669, 0X6996699669966996) +}, +{ + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3, 0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555, 0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 
0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA, 0XAA5555AA55AAAA55, 0XAA5555AA55AAAA55), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0FF0F00F0FF0F00F, 0X0FF0F00F0FF0F00F, 0XF00F0FF0F00F0FF0, 0XF00F0FF0F00F0FF0), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X6996966969969669, 0X6996966969969669, 0X6996966969969669, 0X6996966969969669), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XA55AA55AA55AA55A, 0X5AA55AA55AA55AA5, 0X5AA55AA55AA55AA5, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555, 0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XCCCC33333333CCCC, 0X3333CCCCCCCC3333, 0X3333CCCCCCCC3333, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000, 0XFFFF00000000FFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X6996699669966996, 0X9669966996699669, 0X9669966996699669, 0X6996699669966996) +}, +//2048 +{ + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA, 0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F, 0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55, 0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + 
PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0, 0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55, 0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F, 0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA, 0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0, 0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 
0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X5555555555555555, 0X5555555555555555, 0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA, 0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F, 0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X5555555555555555, 0X5555555555555555, 0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55, 0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0, 0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + 
PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X5555555555555555, 0X5555555555555555, 0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55, 0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F, 0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X5555555555555555, 0X5555555555555555, 0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA, 0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0, 0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X6996966996696996, 
0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +//4096 +{ + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 
0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + 
PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 
0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + 
PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 
0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + 
PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 
0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +} diff --git a/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/controlbits.c b/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/controlbits.c new file mode 100644 index 0000000000..160ff39b12 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/controlbits.c @@ -0,0 +1,274 @@ +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + +#include "controlbits.h" + +#include "params.h" + +#include + +typedef uint8_t bit; + +#define N (1 << GFBITS) + +static bit is_smaller(uint32_t a, uint32_t b) { + uint32_t ret = 0; + + ret = a - b; + ret >>= 31; + + return (bit)ret; +} + +static bit is_smaller_63b(uint64_t a, uint64_t b) { + uint64_t ret = 0; + + ret = a - b; + ret >>= 63; + + return (bit)ret; +} + +static void cswap(uint32_t *x, uint32_t *y, bit swap) { + uint32_t m; + uint32_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +static void cswap_63b(uint64_t *x, uint64_t *y, bit swap) { + uint64_t m; + uint64_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +/* output x = min(input x,input y) */ +/* output y = max(input x,input y) */ + +static void minmax(uint32_t *x, uint32_t *y) { + bit m; + + m = is_smaller(*y, *x); + cswap(x, y, m); +} + +static void minmax_63b(uint64_t *x, uint64_t *y) { + bit m; + + m = is_smaller_63b(*y, *x); + cswap_63b(x, y, m); +} + +/* merge first half of x[0],x[step],...,x[(2*n-1)*step] with second half */ +/* requires n to be a power of 2 */ + +static void merge(int n, uint32_t *x, int step) { + int i; + if (n == 1) { + minmax(&x[0], &x[step]); + } else { + merge(n / 2, x, step * 2); + merge(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax(&x[i * step], &x[(i + 1) * step]); + } + } +} + +static void merge_63b(int n, uint64_t *x, int step) { + int i; + if (n == 1) { + minmax_63b(&x[0], &x[step]); + } else { + merge_63b(n / 2, x, step * 2); + merge_63b(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax_63b(&x[i * step], &x[(i + 1) * step]); + } + } +} + +/* sort x[0],x[1],...,x[n-1] in place */ +/* requires n to be a power of 2 */ + +static void sort(int n, uint32_t *x) { + if (n <= 1) { + 
return; + } + sort(n / 2, x); + sort(n / 2, x + n / 2); + merge(n / 2, x, 1); +} + +void PQCLEAN_MCELIECE460896F_AVX_sort_63b(int n, uint64_t *x) { + if (n <= 1) { + return; + } + PQCLEAN_MCELIECE460896F_AVX_sort_63b(n / 2, x); + PQCLEAN_MCELIECE460896F_AVX_sort_63b(n / 2, x + n / 2); + merge_63b(n / 2, x, 1); +} + +/* y[pi[i]] = x[i] */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void composeinv(int n, uint32_t *y, const uint32_t *x, const uint32_t *pi) { // NC + int i; + uint32_t t[2 * N]; + + for (i = 0; i < n; ++i) { + t[i] = x[i] | (pi[i] << 16); + } + + sort(n, t); + + for (i = 0; i < n; ++i) { + y[i] = t[i] & 0xFFFF; + } +} + +/* ip[i] = j iff pi[i] = j */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void invert(int n, uint32_t *ip, const uint32_t *pi) { + int i; + + for (i = 0; i < n; i++) { + ip[i] = i; + } + + composeinv(n, ip, ip, pi); +} + + +static void flow(int w, uint32_t *x, const uint32_t *y, int t) { + bit m0; + bit m1; + + uint32_t b; + uint32_t y_copy = *y; + + m0 = is_smaller(*y & ((1 << w) - 1), *x & ((1 << w) - 1)); + m1 = is_smaller(0, t); + + cswap(x, &y_copy, m0); + b = m0 & m1; + *x ^= b << w; +} + +/* input: permutation pi */ +/* output: (2w-1)n/2 (or 0 if n==1) control bits c[0],c[step],c[2*step],... */ +/* requires n = 2^w */ +static void controlbitsfrompermutation(int w, int n, int step, int off, unsigned char *c, const uint32_t *pi) { + int i; + int j; + int k; + int t; + uint32_t ip[N] = {0}; + uint32_t I[2 * N] = {0}; + uint32_t P[2 * N] = {0}; + uint32_t PI[2 * N] = {0}; + uint32_t T[2 * N] = {0}; + uint32_t piflip[N] = {0}; + uint32_t subpi[2][N / 2] = {{0}}; + + if (w == 1) { + c[ off / 8 ] |= (pi[0] & 1) << (off % 8); + } + if (w <= 1) { + return; + } + + invert(n, ip, pi); + + for (i = 0; i < n; ++i) { + I[i] = ip[i] | (1 << w); + I[n + i] = pi[i]; + } + + for (i = 0; i < 2 * n; ++i) { + P[i] = (i >> w) + (i & ((1 << w) - 2)) + ((i & 1) << w); + } + + for (t = 0; t < w; ++t) { + composeinv(2 * n, PI, P, I); + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &PI[i], t); + } + + for (i = 0; i < 2 * n; ++i) { + T[i] = I[i ^ 1]; + } + + composeinv(2 * n, I, I, T); + + for (i = 0; i < 2 * n; ++i) { + T[i] = P[i ^ 1]; + } + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &T[i], 1); + } + } + + for (i = 0; i < n; ++i) { + for (j = 0; j < w; ++j) { + piflip[i] = pi[i]; + } + } + + for (i = 0; i < n / 2; ++i) { + c[ (off + i * step) / 8 ] |= ((P[i * 2] >> w) & 1) << ((off + i * step) % 8); + } + for (i = 0; i < n / 2; ++i) { + c[ (off + ((w - 1)*n + i) * step) / 8 ] |= ((P[n + i * 2] >> w) & 1) << ((off + ((w - 1) * n + i) * step) % 8); + } + + for (i = 0; i < n / 2; ++i) { + cswap(&piflip[i * 2], &piflip[i * 2 + 1], (P[n + i * 2] >> w) & 1); + } + + for (k = 0; k < 2; ++k) { + for (i = 0; i < n / 2; ++i) { + subpi[k][i] = piflip[i * 2 + k] >> 1; + } + } + + for (k = 0; k < 2; ++k) { + controlbitsfrompermutation(w - 1, n / 2, step * 2, off + step * (n / 2 + k), c, subpi[k]); + } +} + +/* input: pi, a permutation*/ +/* output: out, control bits w.r.t. 
pi */ +void PQCLEAN_MCELIECE460896F_AVX_controlbits(unsigned char *out, const uint32_t *pi) { + unsigned int i; + unsigned char c[ (2 * GFBITS - 1) * (1 << GFBITS) / 16 ]; + + for (i = 0; i < sizeof(c); i++) { + c[i] = 0; + } + + controlbitsfrompermutation(GFBITS, (1 << GFBITS), 1, 0, c, pi); + + for (i = 0; i < sizeof(c); i++) { + out[i] = c[i]; + } +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/controlbits.h b/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/controlbits.h new file mode 100644 index 0000000000..2e0620f03b --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/controlbits.h @@ -0,0 +1,15 @@ +#ifndef PQCLEAN_MCELIECE460896F_AVX_CONTROLBITS_H +#define PQCLEAN_MCELIECE460896F_AVX_CONTROLBITS_H +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + + +#include + +void PQCLEAN_MCELIECE460896F_AVX_sort_63b(int n, uint64_t *x); +void PQCLEAN_MCELIECE460896F_AVX_controlbits(unsigned char *out, const uint32_t *pi); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/crypto_hash.h b/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/crypto_hash.h new file mode 100644 index 0000000000..476f5111d8 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/crypto_hash.h @@ -0,0 +1,7 @@ +#ifndef PQCLEAN_MCELIECE460896F_AVX_CRYPTO_HASH_H +#define PQCLEAN_MCELIECE460896F_AVX_CRYPTO_HASH_H +#include "fips202.h" + +#define crypto_hash_32b(out,in,inlen) shake256(out, 32, in, inlen) + +#endif diff --git a/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/decrypt.c b/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/decrypt.c new file mode 100644 index 0000000000..a39557a2cc --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/decrypt.c @@ -0,0 +1,234 @@ +/* + This file is for Niederreiter decryption +*/ + +#include "decrypt.h" + +#include "benes.h" +#include "bm.h" +#include "fft.h" +#include "fft_tr.h" +#include "params.h" +#include "util.h" + +#include + +static void scaling(vec256 out[][GFBITS], vec256 inv[][GFBITS], const unsigned char *sk, vec256 *recv) { + int i, j; + + vec128 sk_int[ GFBITS ]; + vec256 eval[32][ GFBITS ]; + vec256 tmp[ GFBITS ]; + + // computing inverses + + PQCLEAN_MCELIECE460896F_AVX_irr_load(sk_int, sk); + + PQCLEAN_MCELIECE460896F_AVX_fft(eval, sk_int); + + for (i = 0; i < 32; i++) { + PQCLEAN_MCELIECE460896F_AVX_vec256_sq(eval[i], eval[i]); + } + + PQCLEAN_MCELIECE460896F_AVX_vec256_copy(inv[0], eval[0]); + + for (i = 1; i < 32; i++) { + PQCLEAN_MCELIECE460896F_AVX_vec256_mul(inv[i], inv[i - 1], eval[i]); + } + + PQCLEAN_MCELIECE460896F_AVX_vec256_inv(tmp, inv[31]); + + for (i = 30; i >= 0; i--) { + PQCLEAN_MCELIECE460896F_AVX_vec256_mul(inv[i + 1], tmp, inv[i]); + PQCLEAN_MCELIECE460896F_AVX_vec256_mul(tmp, tmp, eval[i + 1]); + } + + PQCLEAN_MCELIECE460896F_AVX_vec256_copy(inv[0], tmp); + + // + + for (i = 0; i < 32; i++) { + for (j = 0; j < GFBITS; j++) { + out[i][j] = PQCLEAN_MCELIECE460896F_AVX_vec256_and(inv[i][j], recv[i]); + } + } +} + +static void preprocess(vec128 *recv, const unsigned char *s) { + int i; + unsigned char r[ 1024 ]; + + for (i = 0; i < SYND_BYTES; i++) { + r[i] = s[i]; + } + + for (i = SYND_BYTES; i < 1024; i++) { + r[i] = 0; + } + + for (i = 0; i < 64; i++) { + recv[i] = PQCLEAN_MCELIECE460896F_AVX_load16(r + i * 16); + } +} + +static void 
postprocess(unsigned char *e, vec128 *err) { + int i; + unsigned char error8[ (1 << GFBITS) / 8 ]; + uint64_t v[2]; + + for (i = 0; i < 64; i++) { + v[0] = PQCLEAN_MCELIECE460896F_AVX_vec128_extract(err[i], 0); + v[1] = PQCLEAN_MCELIECE460896F_AVX_vec128_extract(err[i], 1); + + PQCLEAN_MCELIECE460896F_AVX_store8(error8 + i * 16 + 0, v[0]); + PQCLEAN_MCELIECE460896F_AVX_store8(error8 + i * 16 + 8, v[1]); + } + + for (i = 0; i < SYS_N / 8; i++) { + e[i] = error8[i]; + } +} + +static void scaling_inv(vec256 out[][GFBITS], vec256 inv[][GFBITS], vec256 *recv) { + int i, j; + + for (i = 0; i < 32; i++) { + for (j = 0; j < GFBITS; j++) { + out[i][j] = PQCLEAN_MCELIECE460896F_AVX_vec256_and(inv[i][j], recv[i]); + } + } +} + +static uint16_t weight_check(unsigned char *e, vec128 *error) { + int i; + uint16_t w0 = 0; + uint16_t w1 = 0; + uint16_t check; + + for (i = 0; i < 64; i++) { + w0 += _mm_popcnt_u64( PQCLEAN_MCELIECE460896F_AVX_vec128_extract(error[i], 0) ); + w0 += _mm_popcnt_u64( PQCLEAN_MCELIECE460896F_AVX_vec128_extract(error[i], 1) ); + } + + for (i = 0; i < SYS_N / 8; i++) { + w1 += _mm_popcnt_u64( e[i] ); + } + + check = (w0 ^ SYS_T) | (w1 ^ SYS_T); + check -= 1; + check >>= 15; + + return check; +} + +static uint16_t synd_cmp(vec256 *s0, vec256 *s1) { + int i; + vec256 diff; + + diff = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(s0[0], s1[0]); + + for (i = 1; i < GFBITS; i++) { + diff = PQCLEAN_MCELIECE460896F_AVX_vec256_or(diff, PQCLEAN_MCELIECE460896F_AVX_vec256_xor(s0[i], s1[i])); + } + + return (uint16_t)PQCLEAN_MCELIECE460896F_AVX_vec256_testz(diff); +} + +static void reformat_128to256(vec256 *out, vec128 *in) { + int i; + uint64_t v[4]; + + for (i = 0; i < 32; i++) { + v[0] = PQCLEAN_MCELIECE460896F_AVX_vec128_extract(in[2 * i + 0], 0); + v[1] = PQCLEAN_MCELIECE460896F_AVX_vec128_extract(in[2 * i + 0], 1); + v[2] = PQCLEAN_MCELIECE460896F_AVX_vec128_extract(in[2 * i + 1], 0); + v[3] = PQCLEAN_MCELIECE460896F_AVX_vec128_extract(in[2 * i + 1], 1); + + out[i] = PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(v[0], v[1], v[2], v[3]); + } +} + +static void reformat_256to128(vec128 *out, vec256 *in) { + int i; + uint64_t v[4]; + + for (i = 0; i < 32; i++) { + v[0] = PQCLEAN_MCELIECE460896F_AVX_vec256_extract(in[i], 0); + v[1] = PQCLEAN_MCELIECE460896F_AVX_vec256_extract(in[i], 1); + v[2] = PQCLEAN_MCELIECE460896F_AVX_vec256_extract(in[i], 2); + v[3] = PQCLEAN_MCELIECE460896F_AVX_vec256_extract(in[i], 3); + + out[2 * i + 0] = PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(v[0], v[1]); + out[2 * i + 1] = PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(v[2], v[3]); + } +} + +/* Niederreiter decryption with the Berlekamp decoder */ +/* intput: sk, secret key */ +/* c, ciphertext (syndrome) */ +/* output: e, error vector */ +/* return: 0 for success; 1 for failure */ +int PQCLEAN_MCELIECE460896F_AVX_decrypt(unsigned char *e, const unsigned char *sk, const unsigned char *c) { + int i; + + uint16_t check_synd; + uint16_t check_weight; + + vec256 inv[ 32 ][ GFBITS ]; + vec256 scaled[ 32 ][ GFBITS ]; + vec256 eval[32][ GFBITS ]; + + vec128 error128[ 64 ]; + vec256 error256[ 32 ]; + + vec256 s_priv[ GFBITS ]; + vec256 s_priv_cmp[ GFBITS ]; + vec128 locator[ GFBITS ]; + + vec128 recv128[ 64 ]; + vec256 recv256[ 32 ]; + vec256 allone; + + vec128 bits_int[25][32]; + + // Berlekamp decoder + + preprocess(recv128, c); + + PQCLEAN_MCELIECE460896F_AVX_load_bits(bits_int, sk + IRR_BYTES); + PQCLEAN_MCELIECE460896F_AVX_benes(recv128, bits_int, 1); + + reformat_128to256(recv256, recv128); + + scaling(scaled, inv, sk, recv256); + 
PQCLEAN_MCELIECE460896F_AVX_fft_tr(s_priv, scaled); + PQCLEAN_MCELIECE460896F_AVX_bm(locator, s_priv); + + PQCLEAN_MCELIECE460896F_AVX_fft(eval, locator); + + // reencryption and weight check + + allone = PQCLEAN_MCELIECE460896F_AVX_vec256_set1_16b(0xFFFF); + + for (i = 0; i < 32; i++) { + error256[i] = PQCLEAN_MCELIECE460896F_AVX_vec256_or_reduce(eval[i]); + error256[i] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(error256[i], allone); + } + + scaling_inv(scaled, inv, error256); + PQCLEAN_MCELIECE460896F_AVX_fft_tr(s_priv_cmp, scaled); + + check_synd = synd_cmp(s_priv, s_priv_cmp); + + // + + reformat_256to128(error128, error256); + PQCLEAN_MCELIECE460896F_AVX_benes(error128, bits_int, 0); + + postprocess(e, error128); + + check_weight = weight_check(e, error128); + + return 1 - (check_synd & check_weight); +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/decrypt.h b/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/decrypt.h new file mode 100644 index 0000000000..c4d05ca532 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/decrypt.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_MCELIECE460896F_AVX_DECRYPT_H +#define PQCLEAN_MCELIECE460896F_AVX_DECRYPT_H +/* + This file is for Nieddereiter decryption +*/ + +int PQCLEAN_MCELIECE460896F_AVX_decrypt(unsigned char * /*e*/, const unsigned char * /*sk*/, const unsigned char * /*c*/); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/encrypt.c b/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/encrypt.c new file mode 100644 index 0000000000..162b5c663b --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/encrypt.c @@ -0,0 +1,99 @@ +/* + This file is for Niederreiter encryption +*/ + +#include "encrypt.h" + +#include "int32_sort.h" +#include "params.h" +#include "randombytes.h" +#include "util.h" + +#include +#include +#include +#include + +#include "gf.h" + +/* input: public key pk, error vector e */ +/* output: syndrome s */ +extern void PQCLEAN_MCELIECE460896F_AVX_syndrome_asm(unsigned char *s, const unsigned char *pk, unsigned char *e); + +/* output: e, an error vector of weight t */ +static void gen_e(unsigned char *e) { + int i, j, eq, count; + + uint16_t ind[ SYS_T * 2 ]; + int32_t ind32[ SYS_T * 2 ]; + uint64_t e_int[ (SYS_N + 63) / 64 ]; + uint64_t one = 1; + uint64_t mask; + uint64_t val[ SYS_T ]; + + while (1) { + randombytes((uint8_t *) ind, sizeof(ind)); + + for (i = 0; i < SYS_T * 2; i++) { + ind[i] &= GFMASK; + } + + // moving and counting indices in the correct range + + count = 0; + for (i = 0; i < SYS_T * 2; i++) { + if (ind[i] < SYS_N) { + ind32[ count++ ] = ind[i]; + } + } + + if (count < SYS_T) { + continue; + } + + // check for repetition + + PQCLEAN_MCELIECE460896F_AVX_int32_sort(ind32, SYS_T); + + eq = 0; + for (i = 1; i < SYS_T; i++) { + if (ind32[i - 1] == ind32[i]) { + eq = 1; + } + } + + if (eq == 0) { + break; + } + } + + for (j = 0; j < SYS_T; j++) { + val[j] = one << (ind32[j] & 63); + } + + for (i = 0; i < (SYS_N + 63) / 64; i++) { + e_int[i] = 0; + + for (j = 0; j < SYS_T; j++) { + mask = i ^ (ind32[j] >> 6); + mask -= 1; + mask >>= 63; + mask = -mask; + + e_int[i] |= val[j] & mask; + } + } + + for (i = 0; i < (SYS_N + 63) / 64; i++) { + PQCLEAN_MCELIECE460896F_AVX_store8(e, e_int[i]); + e += 8; + } +} + +/* input: public key pk */ +/* output: error vector e, syndrome s */ +void PQCLEAN_MCELIECE460896F_AVX_encrypt(unsigned char *s, unsigned char *e, const unsigned char *pk) { + gen_e(e); + 
PQCLEAN_MCELIECE460896F_AVX_syndrome_asm(s, pk, e); +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/encrypt.h b/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/encrypt.h new file mode 100644 index 0000000000..17a83e3a74 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/encrypt.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_MCELIECE460896F_AVX_ENCRYPT_H +#define PQCLEAN_MCELIECE460896F_AVX_ENCRYPT_H +/* + This file is for Niederreiter encryption +*/ + + +void PQCLEAN_MCELIECE460896F_AVX_encrypt(unsigned char * /*s*/, unsigned char * /*e*/, const unsigned char * /*pk*/); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/fft.c b/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/fft.c new file mode 100644 index 0000000000..e2cf502e3a --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/fft.c @@ -0,0 +1,262 @@ +/* + This file is for the Gao-Mateer FFT + sse http://www.math.clemson.edu/~sgao/papers/GM10.pdf +*/ + +#include "fft.h" + +#include "transpose.h" +#include "vec256.h" + +#include + +/* input: in, polynomial in bitsliced form */ +/* output: in, result of applying the radix conversions on in */ +static void radix_conversions(vec128 *in) { + int i, j, k; + vec128 t; + uint64_t v0, v1; + + const vec128 mask[5][2] = { + { + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0x8888888888888888, 0x8888888888888888), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0x4444444444444444, 0x4444444444444444) + }, + { + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0xC0C0C0C0C0C0C0C0, 0xC0C0C0C0C0C0C0C0), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0x3030303030303030, 0x3030303030303030) + }, + { + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0xF000F000F000F000, 0xF000F000F000F000), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0x0F000F000F000F00, 0x0F000F000F000F00) + }, + { + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0xFF000000FF000000, 0xFF000000FF000000), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0x00FF000000FF0000, 0x00FF000000FF0000) + }, + { + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0xFFFF000000000000, 0xFFFF000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0x0000FFFF00000000, 0x0000FFFF00000000) + } + }; + + const vec128 s[5][GFBITS] = { +#include "scalars_2x.inc" + }; + + // + + for (j = 0; j <= 5; j++) { + for (i = 0; i < GFBITS; i++) { + v1 = PQCLEAN_MCELIECE460896F_AVX_vec128_extract(in[i], 1); + v1 ^= v1 >> 32; + v0 = PQCLEAN_MCELIECE460896F_AVX_vec128_extract(in[i], 0); + v0 ^= v1 << 32; + in[i] = PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(v0, v1); + } + + for (i = 0; i < GFBITS; i++) { + for (k = 4; k >= j; k--) { + t = PQCLEAN_MCELIECE460896F_AVX_vec128_and(in[i], mask[k][0]); + t = PQCLEAN_MCELIECE460896F_AVX_vec128_srl_2x(t, 1 << k); + in[i] = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(in[i], t); + + t = PQCLEAN_MCELIECE460896F_AVX_vec128_and(in[i], mask[k][1]); + t = PQCLEAN_MCELIECE460896F_AVX_vec128_srl_2x(t, 1 << k); + in[i] = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(in[i], t); + } + } + + if (j < 5) { + PQCLEAN_MCELIECE460896F_AVX_vec128_mul(in, in, s[j]); // scaling + } + } +} + +/* input: in, result of applying the radix conversions to the input polynomial */ +/* output: out, evaluation results (by applying the FFT butterflies) */ +static void butterflies(vec256 out[][ GFBITS ], vec128 *in) { + int i, j, k, s, b; + + vec128 tmp[ GFBITS ]; + vec256 tmp0[ GFBITS ]; + vec256 tmp1[ GFBITS ]; + vec128 t[ GFBITS ]; + + union { + vec128 v[8][ GFBITS + 1 ]; + vec256 V[8][ (GFBITS + 1) / 2 ]; + } pre; + + union { + vec128 
v[64][ 2 ]; + vec256 V[64]; + } buf; + + uint64_t v0, v1; + + const vec256 consts[ 33 ][ GFBITS ] = { +#include "consts.inc" + }; + + uint64_t consts_ptr = 2; + + const unsigned char reversal[64] = { + 0, 32, 16, 48, 8, 40, 24, 56, + 4, 36, 20, 52, 12, 44, 28, 60, + 2, 34, 18, 50, 10, 42, 26, 58, + 6, 38, 22, 54, 14, 46, 30, 62, + 1, 33, 17, 49, 9, 41, 25, 57, + 5, 37, 21, 53, 13, 45, 29, 61, + 3, 35, 19, 51, 11, 43, 27, 59, + 7, 39, 23, 55, 15, 47, 31, 63 + }; + + const uint16_t beta[8] = {2522, 7827, 7801, 8035, 6897, 8167, 3476, 0}; + + // boradcast + + for (j = 0; j < GFBITS; j++) { + t[j] = PQCLEAN_MCELIECE460896F_AVX_vec128_unpack_high(in[j], in[j]); + } + + for (i = 0; i < 8; i += 2) { + for (j = 0; j < GFBITS; j++) { + v0 = (beta[i + 0] >> j) & 1; + v0 = -v0; + v1 = (beta[i + 1] >> j) & 1; + v1 = -v1; + + tmp[j] = PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(v0, v1); + } + + PQCLEAN_MCELIECE460896F_AVX_vec128_mul(tmp, t, tmp); + + for (j = 0; j < GFBITS; j++) { + pre.v[i + 0][j] = PQCLEAN_MCELIECE460896F_AVX_vec128_unpack_low(tmp[j], tmp[j]); + pre.v[i + 1][j] = PQCLEAN_MCELIECE460896F_AVX_vec128_unpack_high(tmp[j], tmp[j]); + } + } + + for (i = 0; i < GFBITS; i += 2) { + if (i != GFBITS - 1) { + buf.v[0][1] = PQCLEAN_MCELIECE460896F_AVX_vec128_unpack_low(in[i + 1], in[i + 1] ^ pre.v[6][i + 1]); + } + buf.v[0][0] = PQCLEAN_MCELIECE460896F_AVX_vec128_unpack_low(in[i + 0], in[i + 0] ^ pre.v[6][i + 0]); + + buf.V[1] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[0], pre.V[0][i / 2]); + buf.V[16] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[0], pre.V[4][i / 2]); + buf.V[3] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[1], pre.V[1][i / 2]); + buf.V[48] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[16], pre.V[5][i / 2]); + buf.V[49] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[48], pre.V[0][i / 2]); + buf.V[2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[0], pre.V[1][i / 2]); + buf.V[51] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[49], pre.V[1][i / 2]); + buf.V[6] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[2], pre.V[2][i / 2]); + buf.V[50] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[51], pre.V[0][i / 2]); + buf.V[7] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[6], pre.V[0][i / 2]); + buf.V[54] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[50], pre.V[2][i / 2]); + buf.V[5] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[7], pre.V[1][i / 2]); + buf.V[55] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[54], pre.V[0][i / 2]); + buf.V[53] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[55], pre.V[1][i / 2]); + buf.V[4] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[0], pre.V[2][i / 2]); + buf.V[52] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[53], pre.V[0][i / 2]); + buf.V[12] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[4], pre.V[3][i / 2]); + buf.V[60] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[52], pre.V[3][i / 2]); + buf.V[13] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[12], pre.V[0][i / 2]); + buf.V[61] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[60], pre.V[0][i / 2]); + buf.V[15] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[13], pre.V[1][i / 2]); + buf.V[63] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[61], pre.V[1][i / 2]); + buf.V[14] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[15], pre.V[0][i / 2]); + buf.V[62] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[63], pre.V[0][i / 2]); + buf.V[10] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[14], pre.V[2][i / 2]); + buf.V[58] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[62], pre.V[2][i / 2]); + 
buf.V[11] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[10], pre.V[0][i / 2]); + buf.V[59] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[58], pre.V[0][i / 2]); + buf.V[9] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[11], pre.V[1][i / 2]); + buf.V[57] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[59], pre.V[1][i / 2]); + buf.V[56] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[57], pre.V[0][i / 2]); + buf.V[8] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[0], pre.V[3][i / 2]); + buf.V[40] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[56], pre.V[4][i / 2]); + buf.V[24] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[8], pre.V[4][i / 2]); + buf.V[41] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[40], pre.V[0][i / 2]); + buf.V[25] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[24], pre.V[0][i / 2]); + buf.V[43] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[41], pre.V[1][i / 2]); + buf.V[27] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[25], pre.V[1][i / 2]); + buf.V[42] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[43], pre.V[0][i / 2]); + buf.V[26] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[27], pre.V[0][i / 2]); + buf.V[46] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[42], pre.V[2][i / 2]); + buf.V[30] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[26], pre.V[2][i / 2]); + buf.V[47] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[46], pre.V[0][i / 2]); + buf.V[31] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[30], pre.V[0][i / 2]); + buf.V[45] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[47], pre.V[1][i / 2]); + buf.V[29] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[31], pre.V[1][i / 2]); + buf.V[44] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[45], pre.V[0][i / 2]); + buf.V[28] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[29], pre.V[0][i / 2]); + buf.V[36] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[44], pre.V[3][i / 2]); + buf.V[20] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[28], pre.V[3][i / 2]); + buf.V[37] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[36], pre.V[0][i / 2]); + buf.V[21] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[20], pre.V[0][i / 2]); + buf.V[39] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[37], pre.V[1][i / 2]); + buf.V[23] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[21], pre.V[1][i / 2]); + buf.V[38] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[39], pre.V[0][i / 2]); + buf.V[22] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[23], pre.V[0][i / 2]); + buf.V[34] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[38], pre.V[2][i / 2]); + buf.V[18] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[22], pre.V[2][i / 2]); + buf.V[35] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[34], pre.V[0][i / 2]); + buf.V[19] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[18], pre.V[0][i / 2]); + buf.V[33] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[35], pre.V[1][i / 2]); + buf.V[17] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[19], pre.V[1][i / 2]); + buf.V[32] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[33], pre.V[0][i / 2]); + + // transpose + + PQCLEAN_MCELIECE460896F_AVX_transpose_64x256_sp(buf.V); + + for (j = 0; j < 32; j++) { + if (i != GFBITS - 1) { + out[j][i + 1] = PQCLEAN_MCELIECE460896F_AVX_vec256_unpack_high(buf.V[ reversal[2 * j + 0] ], buf.V[ reversal[2 * j + 1] ]); + } + out[j][i + 0] = PQCLEAN_MCELIECE460896F_AVX_vec256_unpack_low (buf.V[ reversal[2 * j + 0] ], buf.V[ reversal[2 * j + 1] ]); + } + } + + // butterflies + + for (k = 0; k < 32; k += 2) { + for (b = 0; b < GFBITS; b++) { + tmp0[b] = 
PQCLEAN_MCELIECE460896F_AVX_vec256_unpack_low (out[k][b], out[k + 1][b]); + } + for (b = 0; b < GFBITS; b++) { + tmp1[b] = PQCLEAN_MCELIECE460896F_AVX_vec256_unpack_high (out[k][b], out[k + 1][b]); + } + + PQCLEAN_MCELIECE460896F_AVX_vec256_maa_asm(tmp0, tmp1, consts[1]); + + for (b = 0; b < GFBITS; b++) { + out[k][b] = PQCLEAN_MCELIECE460896F_AVX_vec256_unpack_low (tmp0[b], tmp1[b]); + } + for (b = 0; b < GFBITS; b++) { + out[k + 1][b] = PQCLEAN_MCELIECE460896F_AVX_vec256_unpack_high (tmp0[b], tmp1[b]); + } + } + + for (i = 0; i <= 4; i++) { + s = 1 << i; + + for (j = 0; j < 32; j += 2 * s) { + for (k = j; k < j + s; k++) { + PQCLEAN_MCELIECE460896F_AVX_vec256_maa_asm(out[k], out[k + s], consts[ consts_ptr + (k - j) ]); + } + } + + consts_ptr += (1 << i); + } +} + +/* input: in, polynomial in bitsliced form */ +/* output: out, bitsliced results of evaluating in all the field elements */ +void PQCLEAN_MCELIECE460896F_AVX_fft(vec256 out[][GFBITS], vec128 *in) { + radix_conversions(in); + butterflies(out, in); +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/fft.h b/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/fft.h new file mode 100644 index 0000000000..cc1e3abe7f --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/fft.h @@ -0,0 +1,17 @@ +#ifndef PQCLEAN_MCELIECE460896F_AVX_FFT_H +#define PQCLEAN_MCELIECE460896F_AVX_FFT_H +/* + This file is for the Gao-Mateer FFT + sse http://www.math.clemson.edu/~sgao/papers/GM10.pdf +*/ + + +#include "params.h" +#include "vec128.h" +#include "vec256.h" +#include + +void PQCLEAN_MCELIECE460896F_AVX_fft(vec256 /*out*/[][GFBITS], vec128 * /*in*/); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/fft_tr.c b/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/fft_tr.c new file mode 100644 index 0000000000..dc0533fd49 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/fft_tr.c @@ -0,0 +1,398 @@ +/* + This file is for transpose of the Gao-Mateer FFT + Functions with names ending with _tr are (roughly) the transpose of the corresponding functions in fft.c +*/ + +#include "fft_tr.h" + +#include "transpose.h" + +#include + +static void radix_conversions_tr(vec256 *in) { + int i, j, k; + vec256 t; + uint64_t v[4]; + + const vec256 mask[6][2] = { + { + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0x2222222222222222, 0x2222222222222222, 0x2222222222222222, 0x2222222222222222), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0x4444444444444444, 0x4444444444444444, 0x4444444444444444, 0x4444444444444444) + }, + { + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0x0C0C0C0C0C0C0C0C, 0x0C0C0C0C0C0C0C0C, 0x0C0C0C0C0C0C0C0C, 0x0C0C0C0C0C0C0C0C), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0x3030303030303030, 0x3030303030303030, 0x3030303030303030, 0x3030303030303030) + }, + { + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0x00F000F000F000F0, 0x00F000F000F000F0, 0x00F000F000F000F0, 0x00F000F000F000F0), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0x0F000F000F000F00, 0x0F000F000F000F00, 0x0F000F000F000F00, 0x0F000F000F000F00) + }, + { + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0x0000FF000000FF00, 0x0000FF000000FF00, 0x0000FF000000FF00, 0x0000FF000000FF00), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0x00FF000000FF0000, 0x00FF000000FF0000, 0x00FF000000FF0000, 0x00FF000000FF0000) + }, + { + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0x00000000FFFF0000, 0x00000000FFFF0000, 0x00000000FFFF0000, 0x00000000FFFF0000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0x0000FFFF00000000, 0x0000FFFF00000000, 
0x0000FFFF00000000, 0x0000FFFF00000000) + }, + { + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0x00000000FFFFFFFF, 0x00000000FFFFFFFF, 0x00000000FFFFFFFF, 0x00000000FFFFFFFF) + } + }; + + const vec256 s[6][GFBITS] = { +#include "scalars_4x.inc" + }; + + // + + for (j = 6; j >= 0; j--) { + if (j < 6) { + PQCLEAN_MCELIECE460896F_AVX_vec256_mul(in, in, s[j]); // scaling + } + + for (k = j; k <= 4; k++) { + for (i = 0; i < GFBITS; i++) { + t = PQCLEAN_MCELIECE460896F_AVX_vec256_and(in[i], mask[k][0]); + t = PQCLEAN_MCELIECE460896F_AVX_vec256_sll_4x(t, 1 << k); + in[i] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(in[i], t); + + t = PQCLEAN_MCELIECE460896F_AVX_vec256_and(in[i], mask[k][1]); + t = PQCLEAN_MCELIECE460896F_AVX_vec256_sll_4x(t, 1 << k); + in[i] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(in[i], t); + } + } + + if (j <= 5) { + for (i = 0; i < GFBITS; i++) { + v[0] = PQCLEAN_MCELIECE460896F_AVX_vec256_extract(in[i], 0); + v[1] = PQCLEAN_MCELIECE460896F_AVX_vec256_extract(in[i], 1); + v[2] = PQCLEAN_MCELIECE460896F_AVX_vec256_extract(in[i], 2); + v[3] = PQCLEAN_MCELIECE460896F_AVX_vec256_extract(in[i], 3); + + v[1] ^= v[0] >> 32; + v[1] ^= v[1] << 32; + v[3] ^= v[2] >> 32; + v[3] ^= v[3] << 32; + + in[i] = PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(v[0], v[1], v[2], v[3]); + } + } + + for (i = 0; i < GFBITS; i++) { + v[0] = PQCLEAN_MCELIECE460896F_AVX_vec256_extract(in[i], 0); + v[1] = PQCLEAN_MCELIECE460896F_AVX_vec256_extract(in[i], 1); + v[2] = PQCLEAN_MCELIECE460896F_AVX_vec256_extract(in[i], 2); + v[3] = PQCLEAN_MCELIECE460896F_AVX_vec256_extract(in[i], 3); + + v[2] ^= v[1]; + v[3] ^= v[2]; + + in[i] = PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(v[0], v[1], v[2], v[3]); + } + + } +} + +static void butterflies_tr(vec256 *out, vec256 in[][ GFBITS ]) { + int i, j, k, s, b; + + vec256 t0[ GFBITS ]; + vec256 t1[ GFBITS ]; + vec256 t; + + vec128 out128[ GFBITS ][ 2 ]; + vec128 tmp[ GFBITS ]; + + union { + vec128 v[6][ GFBITS + 1 ]; + vec256 V[6][ (GFBITS + 1) / 2 ]; + } pre; + + union { + vec128 v[64][ 2 ]; + vec256 V[64]; + } buf; + + const vec256 consts[ 33 ][ GFBITS ] = { +#include "consts.inc" + }; + + uint64_t v[4]; + uint64_t consts_ptr = 33; + + const unsigned char reversal[] = { + 0, 32, 16, 48, 8, 40, 24, 56, + 4, 36, 20, 52, 12, 44, 28, 60, + 2, 34, 18, 50, 10, 42, 26, 58, + 6, 38, 22, 54, 14, 46, 30, 62, + 1, 33, 17, 49, 9, 41, 25, 57, + 5, 37, 21, 53, 13, 45, 29, 61, + 3, 35, 19, 51, 11, 43, 27, 59, + 7, 39, 23, 55, 15, 47, 31, 63 + }; + + const uint16_t beta[6] = {5246, 5306, 6039, 6685, 4905, 6755}; + + // butterflies + + for (i = 4; i >= 0; i--) { + s = 1 << i; + consts_ptr -= s; + + for (j = 0; j < 32; j += 2 * s) { + for (k = j; k < j + s; k++) { + PQCLEAN_MCELIECE460896F_AVX_vec256_ama_asm(in[k], in[k + s], consts[ consts_ptr + (k - j) ]); + } + } + + } + + for (k = 0; k < 32; k += 2) { + for (b = 0; b < GFBITS; b++) { + t0[b] = PQCLEAN_MCELIECE460896F_AVX_vec256_unpack_low(in[k][b], in[k + 1][b]); + } + for (b = 0; b < GFBITS; b++) { + t1[b] = PQCLEAN_MCELIECE460896F_AVX_vec256_unpack_high(in[k][b], in[k + 1][b]); + } + + PQCLEAN_MCELIECE460896F_AVX_vec256_ama_asm(t0, t1, consts[1]); + + for (b = 0; b < GFBITS; b++) { + in[k][b] = PQCLEAN_MCELIECE460896F_AVX_vec256_unpack_low(t0[b], t1[b]); + } + for (b = 0; b < GFBITS; b++) { + in[k + 1][b] = PQCLEAN_MCELIECE460896F_AVX_vec256_unpack_high(t0[b], t1[b]); + } + + for (b = 0; b < GFBITS; b++) { + t0[b] = 
PQCLEAN_MCELIECE460896F_AVX_vec256_unpack_low_2x(in[k][b], in[k + 1][b]); + } + for (b = 0; b < GFBITS; b++) { + t1[b] = PQCLEAN_MCELIECE460896F_AVX_vec256_unpack_high_2x(in[k][b], in[k + 1][b]); + } + + PQCLEAN_MCELIECE460896F_AVX_vec256_ama_asm(t0, t1, consts[0]); + + for (b = 0; b < GFBITS; b++) { + in[k + 0][b] = PQCLEAN_MCELIECE460896F_AVX_vec256_unpack_low_2x(t0[b], t1[b]); + } + for (b = 0; b < GFBITS; b++) { + in[k + 1][b] = PQCLEAN_MCELIECE460896F_AVX_vec256_unpack_high_2x(t0[b], t1[b]); + } + } + + + // boradcast + + for (i = 0; i < GFBITS; i += 2) { + // transpose + + for (k = 0; k < 32; k++) { + if (i != GFBITS - 1) { + buf.v[ reversal[2 * k + 0] ][1] = PQCLEAN_MCELIECE460896F_AVX_vec256_extract2x(in[ k ][i + 1], 0); + buf.v[ reversal[2 * k + 1] ][1] = PQCLEAN_MCELIECE460896F_AVX_vec256_extract2x(in[ k ][i + 1], 1); + } + + buf.v[ reversal[2 * k + 0] ][0] = PQCLEAN_MCELIECE460896F_AVX_vec256_extract2x(in[ k ][i + 0], 0); + buf.v[ reversal[2 * k + 1] ][0] = PQCLEAN_MCELIECE460896F_AVX_vec256_extract2x(in[ k ][i + 0], 1); + } + + PQCLEAN_MCELIECE460896F_AVX_transpose_64x256_sp(buf.V); + + // + + pre.V[0][i / 2] = buf.V[32]; + buf.V[33] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[33], buf.V[32]); + pre.V[1][i / 2] = buf.V[33]; + buf.V[35] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[35], buf.V[33]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[35]); + buf.V[34] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[34], buf.V[35]); + pre.V[2][i / 2] = buf.V[34]; + buf.V[38] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[38], buf.V[34]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[38]); + buf.V[39] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[39], buf.V[38]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[39]); + buf.V[37] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[37], buf.V[39]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[37]); + buf.V[36] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[36], buf.V[37]); + pre.V[3][i / 2] = buf.V[36]; + buf.V[44] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[44], buf.V[36]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[44]); + buf.V[45] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[45], buf.V[44]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[45]); + buf.V[47] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[47], buf.V[45]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[47]); + buf.V[46] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[46], buf.V[47]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[2][i / 2], buf.V[46]); + buf.V[42] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[42], buf.V[46]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[42]); + buf.V[43] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[43], buf.V[42]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[43]); + buf.V[41] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[41], buf.V[43]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[41]); + buf.V[40] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[40], buf.V[41]); + pre.V[4][i / 2] = buf.V[40]; + buf.V[56] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[56], buf.V[40]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[56]); + 
buf.V[57] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[57], buf.V[56]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[57]); + buf.V[59] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[59], buf.V[57]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[59]); + buf.V[58] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[58], buf.V[59]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[2][i / 2], buf.V[58]); + buf.V[62] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[62], buf.V[58]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[62]); + buf.V[63] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[63], buf.V[62]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[63]); + buf.V[61] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[61], buf.V[63]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[61]); + buf.V[60] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[60], buf.V[61]); + pre.V[3][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[3][i / 2], buf.V[60]); + buf.V[52] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[52], buf.V[60]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[52]); + buf.V[53] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[53], buf.V[52]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[53]); + buf.V[55] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[55], buf.V[53]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[55]); + buf.V[54] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[54], buf.V[55]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[2][i / 2], buf.V[54]); + buf.V[50] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[50], buf.V[54]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[50]); + buf.V[51] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[51], buf.V[50]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[51]); + buf.V[49] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[49], buf.V[51]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[49]); + buf.V[48] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[48], buf.V[49]); + pre.V[5][i / 2] = buf.V[48]; + buf.V[16] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[16], buf.V[48]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[16]); + buf.V[17] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[17], buf.V[16]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[17]); + buf.V[19] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[19], buf.V[17]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[19]); + buf.V[18] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[18], buf.V[19]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[2][i / 2], buf.V[18]); + buf.V[22] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[22], buf.V[18]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[22]); + buf.V[23] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[23], buf.V[22]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[23]); + buf.V[21] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[21], buf.V[23]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[21]); + 
buf.V[20] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[20], buf.V[21]); + pre.V[3][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[3][i / 2], buf.V[20]); + buf.V[28] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[28], buf.V[20]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[28]); + buf.V[29] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[29], buf.V[28]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[29]); + buf.V[31] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[31], buf.V[29]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[31]); + buf.V[30] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[30], buf.V[31]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[2][i / 2], buf.V[30]); + buf.V[26] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[26], buf.V[30]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[26]); + buf.V[27] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[27], buf.V[26]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[27]); + buf.V[25] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[25], buf.V[27]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[25]); + buf.V[24] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[24], buf.V[25]); + pre.V[4][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[4][i / 2], buf.V[24]); + buf.V[8] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[8], buf.V[24]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[8]); + buf.V[9] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[9], buf.V[8]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[9]); + buf.V[11] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[11], buf.V[9]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[11]); + buf.V[10] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[10], buf.V[11]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[2][i / 2], buf.V[10]); + buf.V[14] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[14], buf.V[10]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[14]); + buf.V[15] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[15], buf.V[14]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[15]); + buf.V[13] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[13], buf.V[15]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[13]); + buf.V[12] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[12], buf.V[13]); + pre.V[3][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[3][i / 2], buf.V[12]); + buf.V[4] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[4], buf.V[12]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[4]); + buf.V[5] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[5], buf.V[4]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[5]); + buf.V[7] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[7], buf.V[5]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[7]); + buf.V[6] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[6], buf.V[7]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[2][i / 2], buf.V[6]); + buf.V[2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[2], buf.V[6]); + pre.V[0][i / 2] = 
PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[2]); + buf.V[3] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[3], buf.V[2]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[3]); + buf.V[1] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[1], buf.V[3]); + + pre.V[0][i / 2] = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[1]); + t = PQCLEAN_MCELIECE460896F_AVX_vec256_xor(buf.V[0], buf.V[1]); + + if (i != GFBITS - 1) { + out128[i + 1][0] = PQCLEAN_MCELIECE460896F_AVX_vec256_extract2x(t, 1); + } + out128[i + 0][0] = PQCLEAN_MCELIECE460896F_AVX_vec256_extract2x(t, 0); + + } + + // + + for (j = 0; j < GFBITS; j++) { + tmp[j] = PQCLEAN_MCELIECE460896F_AVX_vec128_setbits((beta[0] >> j) & 1); + } + + PQCLEAN_MCELIECE460896F_AVX_vec128_mul(tmp, pre.v[0], tmp); + + for (b = 0; b < GFBITS; b++) { + out128[b][1] = tmp[b]; + } + + for (i = 1; i < 6; i++) { + for (j = 0; j < GFBITS; j++) { + tmp[j] = PQCLEAN_MCELIECE460896F_AVX_vec128_setbits((beta[i] >> j) & 1); + } + + PQCLEAN_MCELIECE460896F_AVX_vec128_mul(tmp, pre.v[i], tmp); + + for (b = 0; b < GFBITS; b++) { + out128[b][1] = PQCLEAN_MCELIECE460896F_AVX_vec128_xor(out128[b][1], tmp[b]); + } + } + + for (b = 0; b < GFBITS; b++) { + v[0] = PQCLEAN_MCELIECE460896F_AVX_vec128_extract(out128[b][0], 0); + v[1] = PQCLEAN_MCELIECE460896F_AVX_vec128_extract(out128[b][0], 1); + v[2] = PQCLEAN_MCELIECE460896F_AVX_vec128_extract(out128[b][1], 0); + v[3] = PQCLEAN_MCELIECE460896F_AVX_vec128_extract(out128[b][1], 1); + + out[b] = PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(v[0], v[1], v[2], v[3]); + } +} + +/* justifying the length of the output */ +static void postprocess(vec256 *out) { + int i; + uint64_t v[4]; + + for (i = 0; i < 13; i++) { + v[0] = PQCLEAN_MCELIECE460896F_AVX_vec256_extract(out[i], 0); + v[1] = PQCLEAN_MCELIECE460896F_AVX_vec256_extract(out[i], 1); + v[2] = PQCLEAN_MCELIECE460896F_AVX_vec256_extract(out[i], 2); + v[3] = PQCLEAN_MCELIECE460896F_AVX_vec256_extract(out[i], 3); + + v[3] = 0; + + out[i] = PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(v[0], v[1], v[2], v[3]); + } +} + +void PQCLEAN_MCELIECE460896F_AVX_fft_tr(vec256 *out, vec256 in[][ GFBITS ]) { + butterflies_tr(out, in); + radix_conversions_tr(out); + + postprocess(out); +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/fft_tr.h b/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/fft_tr.h new file mode 100644 index 0000000000..b43cd4483f --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/fft_tr.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE460896F_AVX_FFT_TR_H +#define PQCLEAN_MCELIECE460896F_AVX_FFT_TR_H +/* + This file is for transpose of the Gao-Mateer FFT +*/ + + +#include "params.h" +#include "vec256.h" + +void PQCLEAN_MCELIECE460896F_AVX_fft_tr(vec256 * /*out*/, vec256 /*in*/[][ GFBITS ]); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/gf.c b/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/gf.c new file mode 100644 index 0000000000..790d218c62 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/gf.c @@ -0,0 +1,205 @@ +/* + this file is for functions for field arithmetic +*/ + +#include "gf.h" + +#include "params.h" + +/* check if a == 0 */ +gf PQCLEAN_MCELIECE460896F_AVX_gf_iszero(gf a) { + uint32_t t = a; + + t -= 1; + t >>= 19; + + return (gf) t; +} + +/* field multiplication */ +gf PQCLEAN_MCELIECE460896F_AVX_gf_mul(gf in0, gf in1) { + int i; + + uint64_t tmp; + uint64_t t0; + uint64_t t1; + uint64_t t; + + t0 = in0; + 
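/*
 * The loop just below forms the carry-less (polynomial) product of the two
 * 13-bit inputs, and the two masked folds after it reduce the 25-bit result
 * modulo the field polynomial x^13 + x^4 + x^3 + x + 1 (hence the shifts by
 * 9, 10, 12 and 13).  A scalar sketch of that reduction, assuming GFBITS == 13:
 *
 *     t = p & 0x1FF0000;                                  // bits 16..24
 *     p ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13);  // folds into bits 3..15
 *     t = p & 0x000E000;                                  // bits 13..15
 *     p ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13);  // folds into bits 0..6
 *     p &= GFMASK;
 */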
t1 = in1; + + tmp = t0 * (t1 & 1); + + for (i = 1; i < GFBITS; i++) { + tmp ^= (t0 * (t1 & ((uint64_t)1 << i))); + } + + // + + t = tmp & 0x1FF0000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + t = tmp & 0x000E000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + return tmp & GFMASK; +} + +/* square twice */ +static inline gf gf_sq2(gf in) { + int i; + + const uint64_t B[] = {0x1111111111111111, + 0x0303030303030303, + 0x000F000F000F000F, + 0x000000FF000000FF + }; + + const uint64_t M[] = {0x0001FF0000000000, + 0x000000FF80000000, + 0x000000007FC00000, + 0x00000000003FE000 + }; + + uint64_t x = in; + uint64_t t; + + x = (x | (x << 24)) & B[3]; + x = (x | (x << 12)) & B[2]; + x = (x | (x << 6)) & B[1]; + x = (x | (x << 3)) & B[0]; + + for (i = 0; i < 4; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* square and multiply */ +static inline gf gf_sqmul(gf in, gf m) { + int i; + + uint64_t x; + uint64_t t0; + uint64_t t1; + uint64_t t; + + const uint64_t M[] = {0x0000001FF0000000, + 0x000000000FF80000, + 0x000000000007E000 + }; + + t0 = in; + t1 = m; + + x = (t1 << 6) * (t0 & (1 << 6)); + + t0 ^= (t0 << 7); + + x ^= (t1 * (t0 & (0x04001))); + x ^= (t1 * (t0 & (0x08002))) << 1; + x ^= (t1 * (t0 & (0x10004))) << 2; + x ^= (t1 * (t0 & (0x20008))) << 3; + x ^= (t1 * (t0 & (0x40010))) << 4; + x ^= (t1 * (t0 & (0x80020))) << 5; + + for (i = 0; i < 3; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* square twice and multiply */ +static inline gf gf_sq2mul(gf in, gf m) { + int i; + + uint64_t x; + uint64_t t0; + uint64_t t1; + uint64_t t; + + const uint64_t M[] = {0x1FF0000000000000, + 0x000FF80000000000, + 0x000007FC00000000, + 0x00000003FE000000, + 0x0000000001FE0000, + 0x000000000001E000 + }; + + t0 = in; + t1 = m; + + x = (t1 << 18) * (t0 & (1 << 6)); + + t0 ^= (t0 << 21); + + x ^= (t1 * (t0 & (0x010000001))); + x ^= (t1 * (t0 & (0x020000002))) << 3; + x ^= (t1 * (t0 & (0x040000004))) << 6; + x ^= (t1 * (t0 & (0x080000008))) << 9; + x ^= (t1 * (t0 & (0x100000010))) << 12; + x ^= (t1 * (t0 & (0x200000020))) << 15; + + for (i = 0; i < 6; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* return num/den */ +gf PQCLEAN_MCELIECE460896F_AVX_gf_frac(gf den, gf num) { + gf tmp_11; + gf tmp_1111; + gf out; + + tmp_11 = gf_sqmul(den, den); // ^11 + tmp_1111 = gf_sq2mul(tmp_11, tmp_11); // ^1111 + out = gf_sq2(tmp_1111); + out = gf_sq2mul(out, tmp_1111); // ^11111111 + out = gf_sq2(out); + out = gf_sq2mul(out, tmp_1111); // ^111111111111 + + return gf_sqmul(out, num); // ^1111111111110 = ^-1 +} + +/* return 1/den */ +gf PQCLEAN_MCELIECE460896F_AVX_gf_inv(gf in) { + return PQCLEAN_MCELIECE460896F_AVX_gf_frac(in, ((gf) 1)); +} + +/* multiplication in GF((2^m)^t) */ +void PQCLEAN_MCELIECE460896F_AVX_GF_mul(gf *out, const gf *in0, const gf *in1) { + int i, j; + + gf prod[191]; + + for (i = 0; i < 191; i++) { + prod[i] = 0; + } + + for (i = 0; i < 96; i++) { + for (j = 0; j < 96; j++) { + prod[i + j] ^= PQCLEAN_MCELIECE460896F_AVX_gf_mul(in0[i], in1[j]); + } + } + + // + + for (i = 190; i >= 96; i--) { + prod[i - 85] ^= PQCLEAN_MCELIECE460896F_AVX_gf_mul(prod[i], (gf) 714); + prod[i - 91] ^= PQCLEAN_MCELIECE460896F_AVX_gf_mul(prod[i], (gf) 5296); + prod[i - 92] ^= PQCLEAN_MCELIECE460896F_AVX_gf_mul(prod[i], (gf) 728); + prod[i - 96] ^= PQCLEAN_MCELIECE460896F_AVX_gf_mul(prod[i], (gf) 5881); + } + + for (i 
= 0; i < 96; i++) { + out[i] = prod[i]; + } +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/gf.h b/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/gf.h new file mode 100644 index 0000000000..67987f8b18 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/gf.h @@ -0,0 +1,22 @@ +#ifndef PQCLEAN_MCELIECE460896F_AVX_GF_H +#define PQCLEAN_MCELIECE460896F_AVX_GF_H +/* + This file is for functions for field arithmetic +*/ + + +#include + +typedef uint16_t gf; + +gf PQCLEAN_MCELIECE460896F_AVX_gf_iszero(gf a); +gf PQCLEAN_MCELIECE460896F_AVX_gf_add(gf in0, gf in1); +gf PQCLEAN_MCELIECE460896F_AVX_gf_mul(gf in0, gf in1); +uint64_t PQCLEAN_MCELIECE460896F_AVX_gf_mul2(gf a, gf b0, gf b1); +gf PQCLEAN_MCELIECE460896F_AVX_gf_frac(gf den, gf num); +gf PQCLEAN_MCELIECE460896F_AVX_gf_inv(gf in); + +void PQCLEAN_MCELIECE460896F_AVX_GF_mul(gf *out, const gf *in0, const gf *in1); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/int32_sort.c b/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/int32_sort.c new file mode 100644 index 0000000000..d8c2b1c8a5 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/int32_sort.c @@ -0,0 +1,1211 @@ +#include "int32_sort.h" +#define int32 int32_t + +#include + +typedef __m256i int32x8; +#define int32x8_load(z) _mm256_loadu_si256((__m256i_u *) (z)) +#define int32x8_store(z,i) _mm256_storeu_si256((__m256i_u *) (z),(i)) +#define int32x8_min _mm256_min_epi32 +#define int32x8_max _mm256_max_epi32 + +#define int32x8_MINMAX(a,b) \ + do { \ + int32x8 c = int32x8_min(a, b); \ + (b) = int32x8_max(a,b); \ + (a) = c; \ + } while(0) + +static inline void int32_MINMAX(int32 *a, int32 *b) { + int32 ab = *b ^ *a; + int32 c = *b - *a; + c ^= ab & (c ^ *b); + c >>= 31; + c &= ab; + *a ^= c; + *b ^= c; +} + +static void minmax_vector(int32 *x, int32 *y, size_t n) { + if (n < 8) { + while (n > 0) { + int32_MINMAX(x, y); + ++x; + ++y; + --n; + } + return; + } + if (n & 7) { + int32x8 x0 = int32x8_load(x + n - 8); + int32x8 y0 = int32x8_load(y + n - 8); + int32x8_MINMAX(x0, y0); + int32x8_store(x + n - 8, x0); + int32x8_store(y + n - 8, y0); + n &= ~7; + } + do { + int32x8 x0 = int32x8_load(x); + int32x8 y0 = int32x8_load(y); + int32x8_MINMAX(x0, y0); + int32x8_store(x, x0); + int32x8_store(y, y0); + x += 8; + y += 8; + n -= 8; + } while (n); +} + +/* stages 8,4,2,1 of size-16 bitonic merging */ +static void merge16_finish(int32 *x, int32x8 x0, int32x8 x1, int flagdown) { + int32x8 b0, b1, c0, c1, mask; + + int32x8_MINMAX(x0, x1); + + b0 = _mm256_permute2x128_si256(x0, x1, 0x20); /* A0123B0123 */ + b1 = _mm256_permute2x128_si256(x0, x1, 0x31); /* A4567B4567 */ + + int32x8_MINMAX(b0, b1); + + c0 = _mm256_unpacklo_epi64(b0, b1); /* A0145B0145 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* A2367B2367 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A0213B0213 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A4657B4657 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* A0246B0246 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* A1357B1357 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A0123B0123 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A4567B4567 */ + + x0 = _mm256_permute2x128_si256(b0, b1, 0x20); /* A01234567 */ + x1 = _mm256_permute2x128_si256(b0, b1, 0x31); /* A01234567 */ + + if (flagdown) { + mask = _mm256_set1_epi32(-1); + x0 ^= mask; + x1 ^= mask; + } + + int32x8_store(&x[0], x0); + int32x8_store(&x[8], x1); +} + +/* stages 64,32 of bitonic merging; n is 
multiple of 128 */ +static void int32_twostages_32(int32 *x, size_t n) { + size_t i; + + while (n > 0) { + for (i = 0; i < 32; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + 32]); + int32x8 x2 = int32x8_load(&x[i + 64]); + int32x8 x3 = int32x8_load(&x[i + 96]); + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + 32], x1); + int32x8_store(&x[i + 64], x2); + int32x8_store(&x[i + 96], x3); + } + x += 128; + n -= 128; + } +} + +/* stages 4q,2q,q of bitonic merging */ +static size_t int32_threestages(int32 *x, size_t n, size_t q) { + size_t k, i; + + for (k = 0; k + 8 * q <= n; k += 8 * q) { + for (i = k; i < k + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + } + + return k; +} + +/* n is a power of 2; n >= 8; if n == 8 then flagdown */ +// NOLINTNEXTLINE(google-readability-function-size) +static void int32_sort_2power(int32 *x, size_t n, int flagdown) { + size_t p, q, i, j, k; + int32x8 mask; + + if (n == 8) { + int32 x0 = x[0]; + int32 x1 = x[1]; + int32 x2 = x[2]; + int32 x3 = x[3]; + int32 x4 = x[4]; + int32 x5 = x[5]; + int32 x6 = x[6]; + int32 x7 = x[7]; + + /* odd-even sort instead of bitonic sort */ + + int32_MINMAX(&x1, &x0); + int32_MINMAX(&x3, &x2); + int32_MINMAX(&x2, &x0); + int32_MINMAX(&x3, &x1); + int32_MINMAX(&x2, &x1); + + int32_MINMAX(&x5, &x4); + int32_MINMAX(&x7, &x6); + int32_MINMAX(&x6, &x4); + int32_MINMAX(&x7, &x5); + int32_MINMAX(&x6, &x5); + + int32_MINMAX(&x4, &x0); + int32_MINMAX(&x6, &x2); + int32_MINMAX(&x4, &x2); + + int32_MINMAX(&x5, &x1); + int32_MINMAX(&x7, &x3); + int32_MINMAX(&x5, &x3); + + int32_MINMAX(&x2, &x1); + int32_MINMAX(&x4, &x3); + int32_MINMAX(&x6, &x5); + + x[0] = x0; + x[1] = x1; + x[2] = x2; + x[3] = x3; + x[4] = x4; + x[5] = x5; + x[6] = x6; + x[7] = x7; + return; + } + + if (n == 16) { + int32x8 x0, x1, b0, b1, c0, c1; + + x0 = int32x8_load(&x[0]); + x1 = int32x8_load(&x[8]); + + mask = _mm256_set_epi32(0, 0, -1, -1, 0, 0, -1, -1); + + x0 ^= mask; /* A01234567 */ + x1 ^= mask; /* B01234567 */ + + b0 = _mm256_unpacklo_epi32(x0, x1); /* AB0AB1AB4AB5 */ + b1 = _mm256_unpackhi_epi32(x0, x1); /* AB2AB3AB6AB7 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* AB0AB2AB4AB6 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* AB1AB3AB5AB7 */ + + int32x8_MINMAX(c0, c1); + + mask = _mm256_set_epi32(0, 0, -1, -1, -1, -1, 0, 0); + c0 ^= mask; + c1 ^= mask; + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A01B01A45B45 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A23B23A67B67 */ + + int32x8_MINMAX(b0, b1); + + x0 = _mm256_unpacklo_epi64(b0, b1); /* 
A01234567 */ + x1 = _mm256_unpackhi_epi64(b0, b1); /* B01234567 */ + + b0 = _mm256_unpacklo_epi32(x0, x1); /* AB0AB1AB4AB5 */ + b1 = _mm256_unpackhi_epi32(x0, x1); /* AB2AB3AB6AB7 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* AB0AB2AB4AB6 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* AB1AB3AB5AB7 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A01B01A45B45 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A23B23A67B67 */ + + b0 ^= mask; + b1 ^= mask; + + c0 = _mm256_permute2x128_si256(b0, b1, 0x20); /* A01B01A23B23 */ + c1 = _mm256_permute2x128_si256(b0, b1, 0x31); /* A45B45A67B67 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_permute2x128_si256(c0, c1, 0x20); /* A01B01A45B45 */ + b1 = _mm256_permute2x128_si256(c0, c1, 0x31); /* A23B23A67B67 */ + + int32x8_MINMAX(b0, b1); + + x0 = _mm256_unpacklo_epi64(b0, b1); /* A01234567 */ + x1 = _mm256_unpackhi_epi64(b0, b1); /* B01234567 */ + + b0 = _mm256_unpacklo_epi32(x0, x1); /* AB0AB1AB4AB5 */ + b1 = _mm256_unpackhi_epi32(x0, x1); /* AB2AB3AB6AB7 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* AB0AB2AB4AB6 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* AB1AB3AB5AB7 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A01B01A45B45 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A23B23A67B67 */ + + x0 = _mm256_unpacklo_epi64(b0, b1); /* A01234567 */ + x1 = _mm256_unpackhi_epi64(b0, b1); /* B01234567 */ + + mask = _mm256_set1_epi32(-1); + if (flagdown) { + x1 ^= mask; + } else { + x0 ^= mask; + } + + merge16_finish(x, x0, x1, flagdown); + return; + } + + if (n == 32) { + int32x8 x0, x1, x2, x3; + + int32_sort_2power(x, 16, 1); + int32_sort_2power(x + 16, 16, 0); + + x0 = int32x8_load(&x[0]); + x1 = int32x8_load(&x[8]); + x2 = int32x8_load(&x[16]); + x3 = int32x8_load(&x[24]); + + if (flagdown) { + mask = _mm256_set1_epi32(-1); + x0 ^= mask; + x1 ^= mask; + x2 ^= mask; + x3 ^= mask; + } + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + + merge16_finish(x, x0, x1, flagdown); + merge16_finish(x + 16, x2, x3, flagdown); + return; + } + + p = n >> 3; + for (i = 0; i < p; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x2 = int32x8_load(&x[i + 2 * p]); + int32x8 x4 = int32x8_load(&x[i + 4 * p]); + int32x8 x6 = int32x8_load(&x[i + 6 * p]); + + /* odd-even stage instead of bitonic stage */ + + int32x8_MINMAX(x4, x0); + int32x8_MINMAX(x6, x2); + int32x8_MINMAX(x2, x0); + int32x8_MINMAX(x6, x4); + int32x8_MINMAX(x2, x4); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + 2 * p], x2); + int32x8_store(&x[i + 4 * p], x4); + int32x8_store(&x[i + 6 * p], x6); + + int32x8 x1 = int32x8_load(&x[i + p]); + int32x8 x3 = int32x8_load(&x[i + 3 * p]); + int32x8 x5 = int32x8_load(&x[i + 5 * p]); + int32x8 x7 = int32x8_load(&x[i + 7 * p]); + + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x5, x3); + + int32x8_store(&x[i + p], x1); + int32x8_store(&x[i + 3 * p], x3); + int32x8_store(&x[i + 5 * p], x5); + int32x8_store(&x[i + 7 * p], x7); + } + + if (n >= 128) { + int flip, flipflip; + + mask = _mm256_set1_epi32(-1); + + for (j = 0; j < n; j += 32) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 16]); + x0 ^= mask; + x1 ^= mask; + int32x8_store(&x[j], x0); + int32x8_store(&x[j + 16], x1); + } + + p = 8; + for (;;) { /* for p in [8, 16, ..., n/16] */ + q = p >> 1; + while (q >= 128) { + int32_threestages(x, n, q >> 2); + q >>= 3; + } + if (q == 64) { + int32_twostages_32(x, n); + q = 16; + } + if (q == 32) { + q = 8; + for (k = 0; k < 
n; k += 8 * q) { + for (i = k; i < k + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + } + q = 4; + } + if (q == 16) { + q = 8; + for (k = 0; k < n; k += 4 * q) { + for (i = k; i < k + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + } + } + q = 4; + } + if (q == 8) { + for (k = 0; k < n; k += q + q) { + int32x8 x0 = int32x8_load(&x[k]); + int32x8 x1 = int32x8_load(&x[k + q]); + + int32x8_MINMAX(x0, x1); + + int32x8_store(&x[k], x0); + int32x8_store(&x[k + q], x1); + } + } + + q = n >> 3; + flip = 0; + if (p << 1 == q) { + flip = 1; + } + flipflip = 1 - flip; + for (j = 0; j < q; j += p + p) { + for (k = j; k < j + p + p; k += p) { + for (i = k; i < k + p; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + + if (flip) { + x0 ^= mask; + x1 ^= mask; + x2 ^= mask; + x3 ^= mask; + x4 ^= mask; + x5 ^= mask; + x6 ^= mask; + x7 ^= mask; + } + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + flip ^= 1; + } + flip ^= flipflip; + } + + if (p << 4 == n) { + break; + } + p <<= 1; + } + } + + for (p = 4; p >= 1; p >>= 1) { + int32 *z = x; + int32 *target = x + n; + if (p == 4) { + mask = _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1); + while (z != target) { + int32x8 x0 = int32x8_load(&z[0]); + int32x8 x1 = int32x8_load(&z[8]); + x0 ^= mask; + x1 ^= mask; + int32x8_store(&z[0], x0); + int32x8_store(&z[8], x1); + z += 16; + } + } else if (p == 2) { + mask = _mm256_set_epi32(0, 0, -1, -1, -1, -1, 0, 0); + while (z != target) 
{ + int32x8 x0 = int32x8_load(&z[0]); + int32x8 x1 = int32x8_load(&z[8]); + x0 ^= mask; + x1 ^= mask; + int32x8 b0 = _mm256_permute2x128_si256(x0, x1, 0x20); + int32x8 b1 = _mm256_permute2x128_si256(x0, x1, 0x31); + int32x8_MINMAX(b0, b1); + int32x8 c0 = _mm256_permute2x128_si256(b0, b1, 0x20); + int32x8 c1 = _mm256_permute2x128_si256(b0, b1, 0x31); + int32x8_store(&z[0], c0); + int32x8_store(&z[8], c1); + z += 16; + } + } else { /* p == 1 */ + mask = _mm256_set_epi32(0, -1, -1, 0, 0, -1, -1, 0); + while (z != target) { + int32x8 x0 = int32x8_load(&z[0]); + int32x8 x1 = int32x8_load(&z[8]); + x0 ^= mask; + x1 ^= mask; + int32x8 b0 = _mm256_permute2x128_si256(x0, x1, 0x20); /* A0123B0123 */ + int32x8 b1 = _mm256_permute2x128_si256(x0, x1, 0x31); /* A4567B4567 */ + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); /* A0145B0145 */ + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); /* A2367B2367 */ + int32x8_MINMAX(c0, c1); + int32x8 d0 = _mm256_unpacklo_epi64(c0, c1); /* A0123B0123 */ + int32x8 d1 = _mm256_unpackhi_epi64(c0, c1); /* A4567B4567 */ + int32x8_MINMAX(d0, d1); + int32x8 e0 = _mm256_permute2x128_si256(d0, d1, 0x20); + int32x8 e1 = _mm256_permute2x128_si256(d0, d1, 0x31); + int32x8_store(&z[0], e0); + int32x8_store(&z[8], e1); + z += 16; + } + } + + q = n >> 4; + while (q >= 128 || q == 32) { + int32_threestages(x, n, q >> 2); + q >>= 3; + } + while (q >= 16) { + q >>= 1; + for (j = 0; j < n; j += 4 * q) { + for (k = j; k < j + q; k += 8) { + int32x8 x0 = int32x8_load(&x[k]); + int32x8 x1 = int32x8_load(&x[k + q]); + int32x8 x2 = int32x8_load(&x[k + 2 * q]); + int32x8 x3 = int32x8_load(&x[k + 3 * q]); + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + + int32x8_store(&x[k], x0); + int32x8_store(&x[k + q], x1); + int32x8_store(&x[k + 2 * q], x2); + int32x8_store(&x[k + 3 * q], x3); + } + } + q >>= 1; + } + if (q == 8) { + for (j = 0; j < n; j += 2 * q) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + q]); + + int32x8_MINMAX(x0, x1); + + int32x8_store(&x[j], x0); + int32x8_store(&x[j + q], x1); + } + } + + q = n >> 3; + for (k = 0; k < q; k += 8) { + int32x8 x0 = int32x8_load(&x[k]); + int32x8 x1 = int32x8_load(&x[k + q]); + int32x8 x2 = int32x8_load(&x[k + 2 * q]); + int32x8 x3 = int32x8_load(&x[k + 3 * q]); + int32x8 x4 = int32x8_load(&x[k + 4 * q]); + int32x8 x5 = int32x8_load(&x[k + 5 * q]); + int32x8 x6 = int32x8_load(&x[k + 6 * q]); + int32x8 x7 = int32x8_load(&x[k + 7 * q]); + + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + + int32x8_store(&x[k], x0); + int32x8_store(&x[k + q], x1); + int32x8_store(&x[k + 2 * q], x2); + int32x8_store(&x[k + 3 * q], x3); + int32x8_store(&x[k + 4 * q], x4); + int32x8_store(&x[k + 5 * q], x5); + int32x8_store(&x[k + 6 * q], x6); + int32x8_store(&x[k + 7 * q], x7); + } + } + + /* everything is still masked with _mm256_set_epi32(0,-1,0,-1,0,-1,0,-1); */ + mask = _mm256_set1_epi32(-1); + + for (i = 0; i < n; i += 64) { + int32x8 a0 = int32x8_load(&x[i]); + int32x8 a1 = int32x8_load(&x[i + 8]); + int32x8 a2 = int32x8_load(&x[i + 16]); + int32x8 a3 = int32x8_load(&x[i + 24]); + int32x8 a4 = int32x8_load(&x[i + 32]); + int32x8 a5 = int32x8_load(&x[i + 40]); + int32x8 a6 = int32x8_load(&x[i + 48]); + int32x8 a7 = 
int32x8_load(&x[i + 56]); + + int32x8 b0 = _mm256_unpacklo_epi32(a0, a1); /* AB0AB1AB4AB5 */ + int32x8 b1 = _mm256_unpackhi_epi32(a0, a1); /* AB2AB3AB6AB7 */ + int32x8 b2 = _mm256_unpacklo_epi32(a2, a3); /* CD0CD1CD4CD5 */ + int32x8 b3 = _mm256_unpackhi_epi32(a2, a3); /* CD2CD3CD6CD7 */ + int32x8 b4 = _mm256_unpacklo_epi32(a4, a5); /* EF0EF1EF4EF5 */ + int32x8 b5 = _mm256_unpackhi_epi32(a4, a5); /* EF2EF3EF6EF7 */ + int32x8 b6 = _mm256_unpacklo_epi32(a6, a7); /* GH0GH1GH4GH5 */ + int32x8 b7 = _mm256_unpackhi_epi32(a6, a7); /* GH2GH3GH6GH7 */ + + int32x8 c0 = _mm256_unpacklo_epi64(b0, b2); /* ABCD0ABCD4 */ + int32x8 c1 = _mm256_unpacklo_epi64(b1, b3); /* ABCD2ABCD6 */ + int32x8 c2 = _mm256_unpackhi_epi64(b0, b2); /* ABCD1ABCD5 */ + int32x8 c3 = _mm256_unpackhi_epi64(b1, b3); /* ABCD3ABCD7 */ + int32x8 c4 = _mm256_unpacklo_epi64(b4, b6); /* EFGH0EFGH4 */ + int32x8 c5 = _mm256_unpacklo_epi64(b5, b7); /* EFGH2EFGH6 */ + int32x8 c6 = _mm256_unpackhi_epi64(b4, b6); /* EFGH1EFGH5 */ + int32x8 c7 = _mm256_unpackhi_epi64(b5, b7); /* EFGH3EFGH7 */ + + if (flagdown) { + c2 ^= mask; + c3 ^= mask; + c6 ^= mask; + c7 ^= mask; + } else { + c0 ^= mask; + c1 ^= mask; + c4 ^= mask; + c5 ^= mask; + } + + int32x8 d0 = _mm256_permute2x128_si256(c0, c4, 0x20); /* ABCDEFGH0 */ + int32x8 d1 = _mm256_permute2x128_si256(c2, c6, 0x20); /* ABCDEFGH1 */ + int32x8 d2 = _mm256_permute2x128_si256(c1, c5, 0x20); /* ABCDEFGH2 */ + int32x8 d3 = _mm256_permute2x128_si256(c3, c7, 0x20); /* ABCDEFGH5 */ + int32x8 d4 = _mm256_permute2x128_si256(c0, c4, 0x31); /* ABCDEFGH4 */ + int32x8 d5 = _mm256_permute2x128_si256(c2, c6, 0x31); /* ABCDEFGH3 */ + int32x8 d6 = _mm256_permute2x128_si256(c1, c5, 0x31); /* ABCDEFGH6 */ + int32x8 d7 = _mm256_permute2x128_si256(c3, c7, 0x31); /* ABCDEFGH7 */ + + int32x8_MINMAX(d0, d1); + int32x8_MINMAX(d2, d3); + int32x8_MINMAX(d4, d5); + int32x8_MINMAX(d6, d7); + int32x8_MINMAX(d0, d2); + int32x8_MINMAX(d1, d3); + int32x8_MINMAX(d4, d6); + int32x8_MINMAX(d5, d7); + int32x8_MINMAX(d0, d4); + int32x8_MINMAX(d1, d5); + int32x8_MINMAX(d2, d6); + int32x8_MINMAX(d3, d7); + + int32x8 e0 = _mm256_unpacklo_epi32(d0, d1); + int32x8 e1 = _mm256_unpackhi_epi32(d0, d1); + int32x8 e2 = _mm256_unpacklo_epi32(d2, d3); + int32x8 e3 = _mm256_unpackhi_epi32(d2, d3); + int32x8 e4 = _mm256_unpacklo_epi32(d4, d5); + int32x8 e5 = _mm256_unpackhi_epi32(d4, d5); + int32x8 e6 = _mm256_unpacklo_epi32(d6, d7); + int32x8 e7 = _mm256_unpackhi_epi32(d6, d7); + + int32x8 f0 = _mm256_unpacklo_epi64(e0, e2); + int32x8 f1 = _mm256_unpacklo_epi64(e1, e3); + int32x8 f2 = _mm256_unpackhi_epi64(e0, e2); + int32x8 f3 = _mm256_unpackhi_epi64(e1, e3); + int32x8 f4 = _mm256_unpacklo_epi64(e4, e6); + int32x8 f5 = _mm256_unpacklo_epi64(e5, e7); + int32x8 f6 = _mm256_unpackhi_epi64(e4, e6); + int32x8 f7 = _mm256_unpackhi_epi64(e5, e7); + + int32x8 g0 = _mm256_permute2x128_si256(f0, f4, 0x20); + int32x8 g1 = _mm256_permute2x128_si256(f2, f6, 0x20); + int32x8 g2 = _mm256_permute2x128_si256(f1, f5, 0x20); + int32x8 g3 = _mm256_permute2x128_si256(f3, f7, 0x20); + int32x8 g4 = _mm256_permute2x128_si256(f0, f4, 0x31); + int32x8 g5 = _mm256_permute2x128_si256(f2, f6, 0x31); + int32x8 g6 = _mm256_permute2x128_si256(f1, f5, 0x31); + int32x8 g7 = _mm256_permute2x128_si256(f3, f7, 0x31); + + int32x8_store(&x[i], g0); + int32x8_store(&x[i + 8], g1); + int32x8_store(&x[i + 16], g2); + int32x8_store(&x[i + 24], g3); + int32x8_store(&x[i + 32], g4); + int32x8_store(&x[i + 40], g5); + int32x8_store(&x[i + 48], g6); + int32x8_store(&x[i + 56], g7); + } + + q = 
n >> 4; + while (q >= 128 || q == 32) { + q >>= 2; + for (j = 0; j < n; j += 8 * q) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + } + q >>= 1; + } + while (q >= 16) { + q >>= 1; + for (j = 0; j < n; j += 4 * q) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + } + } + q >>= 1; + } + if (q == 8) { + for (j = 0; j < n; j += q + q) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + q]); + int32x8_MINMAX(x0, x1); + int32x8_store(&x[j], x0); + int32x8_store(&x[j + q], x1); + } + } + + q = n >> 3; + for (i = 0; i < q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + + int32x8 b0 = _mm256_unpacklo_epi32(x0, x4); /* AE0AE1AE4AE5 */ + int32x8 b1 = _mm256_unpackhi_epi32(x0, x4); /* AE2AE3AE6AE7 */ + int32x8 b2 = _mm256_unpacklo_epi32(x1, x5); /* BF0BF1BF4BF5 */ + int32x8 b3 = _mm256_unpackhi_epi32(x1, x5); /* BF2BF3BF6BF7 */ + int32x8 b4 = _mm256_unpacklo_epi32(x2, x6); /* CG0CG1CG4CG5 */ + int32x8 b5 = _mm256_unpackhi_epi32(x2, x6); /* CG2CG3CG6CG7 */ + int32x8 b6 = _mm256_unpacklo_epi32(x3, x7); /* DH0DH1DH4DH5 */ + int32x8 b7 = _mm256_unpackhi_epi32(x3, x7); /* DH2DH3DH6DH7 */ + + int32x8 c0 = _mm256_unpacklo_epi64(b0, b4); /* AECG0AECG4 */ + int32x8 c1 = _mm256_unpacklo_epi64(b1, b5); /* AECG2AECG6 */ + int32x8 c2 = _mm256_unpackhi_epi64(b0, b4); /* AECG1AECG5 */ + int32x8 c3 = _mm256_unpackhi_epi64(b1, b5); /* AECG3AECG7 */ + int32x8 c4 = _mm256_unpacklo_epi64(b2, b6); /* BFDH0BFDH4 */ + int32x8 c5 = _mm256_unpacklo_epi64(b3, b7); /* BFDH2BFDH6 */ + int32x8 c6 = _mm256_unpackhi_epi64(b2, b6); /* BFDH1BFDH5 */ + int32x8 c7 = _mm256_unpackhi_epi64(b3, b7); /* BFDH3BFDH7 */ + + 
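/*
 * The permute/unpack sequence that follows appears to be an in-register 8x8
 * transpose of 32-bit lanes, putting the outputs of the compare network back
 * into their final positions.  When flagdown is set the results are also
 * complemented (XOR with all-ones); bitwise complement reverses the signed
 * order, so this yields a descending run that the caller can merge
 * bitonically with an ascending one.  (Hedged reading of the surrounding
 * code.)
 */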
int32x8 d0 = _mm256_permute2x128_si256(c0, c4, 0x20); /* AECGBFDH0 */ + int32x8 d1 = _mm256_permute2x128_si256(c1, c5, 0x20); /* AECGBFDH2 */ + int32x8 d2 = _mm256_permute2x128_si256(c2, c6, 0x20); /* AECGBFDH1 */ + int32x8 d3 = _mm256_permute2x128_si256(c3, c7, 0x20); /* AECGBFDH3 */ + int32x8 d4 = _mm256_permute2x128_si256(c0, c4, 0x31); /* AECGBFDH4 */ + int32x8 d5 = _mm256_permute2x128_si256(c1, c5, 0x31); /* AECGBFDH6 */ + int32x8 d6 = _mm256_permute2x128_si256(c2, c6, 0x31); /* AECGBFDH5 */ + int32x8 d7 = _mm256_permute2x128_si256(c3, c7, 0x31); /* AECGBFDH7 */ + + if (flagdown) { + d0 ^= mask; + d1 ^= mask; + d2 ^= mask; + d3 ^= mask; + d4 ^= mask; + d5 ^= mask; + d6 ^= mask; + d7 ^= mask; + } + + int32x8_store(&x[i], d0); + int32x8_store(&x[i + q], d4); + int32x8_store(&x[i + 2 * q], d1); + int32x8_store(&x[i + 3 * q], d5); + int32x8_store(&x[i + 4 * q], d2); + int32x8_store(&x[i + 5 * q], d6); + int32x8_store(&x[i + 6 * q], d3); + int32x8_store(&x[i + 7 * q], d7); + } +} + +void PQCLEAN_MCELIECE460896F_AVX_int32_sort(int32 *x, size_t n) { + size_t q, i, j; + + if (n <= 8) { + if (n == 8) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + int32_MINMAX(&x[4], &x[5]); + int32_MINMAX(&x[5], &x[6]); + int32_MINMAX(&x[6], &x[7]); + } + if (n >= 7) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + int32_MINMAX(&x[4], &x[5]); + int32_MINMAX(&x[5], &x[6]); + } + if (n >= 6) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + int32_MINMAX(&x[4], &x[5]); + } + if (n >= 5) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + } + if (n >= 4) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + } + if (n >= 3) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + } + if (n >= 2) { + int32_MINMAX(&x[0], &x[1]); + } + return; + } + + if (!(n & (n - 1))) { + int32_sort_2power(x, n, 0); + return; + } + + q = 8; + while (q < n - q) { + q += q; + } + /* n > q >= 8 */ + + if (q <= 128) { /* n <= 256 */ + int32x8 y[32]; + for (i = q >> 3; i < q >> 2; ++i) { + y[i] = _mm256_set1_epi32(0x7fffffff); + } + for (i = 0; i < n; ++i) { + ((int32 *)y)[i] = x[i]; + } + int32_sort_2power((int32 *) y, 2 * q, 0); + for (i = 0; i < n; ++i) { + x[i] = ((int32 *) y)[i]; + } + return; + } + + int32_sort_2power(x, q, 1); + PQCLEAN_MCELIECE460896F_AVX_int32_sort(x + q, n - q); + + while (q >= 64) { + q >>= 2; + j = int32_threestages(x, n, q); + minmax_vector(x + j, x + j + 4 * q, n - 4 * q - j); + if (j + 4 * q <= n) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + } + j += 4 * q; + } + minmax_vector(x + j, x + j + 2 * q, n - 2 * q - j); + if (j + 2 * q <= n) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8_MINMAX(x0, x1); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + } + j += 2 * q; + } + minmax_vector(x + j, x + j 
+ q, n - q - j); + q >>= 1; + } + if (q == 32) { + j = 0; + for (; j + 64 <= n; j += 64) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 8]); + int32x8 x2 = int32x8_load(&x[j + 16]); + int32x8 x3 = int32x8_load(&x[j + 24]); + int32x8 x4 = int32x8_load(&x[j + 32]); + int32x8 x5 = int32x8_load(&x[j + 40]); + int32x8 x6 = int32x8_load(&x[j + 48]); + int32x8 x7 = int32x8_load(&x[j + 56]); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8 a0 = _mm256_permute2x128_si256(x0, x1, 0x20); + int32x8 a1 = _mm256_permute2x128_si256(x0, x1, 0x31); + int32x8 a2 = _mm256_permute2x128_si256(x2, x3, 0x20); + int32x8 a3 = _mm256_permute2x128_si256(x2, x3, 0x31); + int32x8 a4 = _mm256_permute2x128_si256(x4, x5, 0x20); + int32x8 a5 = _mm256_permute2x128_si256(x4, x5, 0x31); + int32x8 a6 = _mm256_permute2x128_si256(x6, x7, 0x20); + int32x8 a7 = _mm256_permute2x128_si256(x6, x7, 0x31); + int32x8_MINMAX(a0, a1); + int32x8_MINMAX(a2, a3); + int32x8_MINMAX(a4, a5); + int32x8_MINMAX(a6, a7); + int32x8 b0 = _mm256_permute2x128_si256(a0, a1, 0x20); + int32x8 b1 = _mm256_permute2x128_si256(a0, a1, 0x31); + int32x8 b2 = _mm256_permute2x128_si256(a2, a3, 0x20); + int32x8 b3 = _mm256_permute2x128_si256(a2, a3, 0x31); + int32x8 b4 = _mm256_permute2x128_si256(a4, a5, 0x20); + int32x8 b5 = _mm256_permute2x128_si256(a4, a5, 0x31); + int32x8 b6 = _mm256_permute2x128_si256(a6, a7, 0x20); + int32x8 b7 = _mm256_permute2x128_si256(a6, a7, 0x31); + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); + int32x8 c2 = _mm256_unpacklo_epi64(b2, b3); + int32x8 c3 = _mm256_unpackhi_epi64(b2, b3); + int32x8 c4 = _mm256_unpacklo_epi64(b4, b5); + int32x8 c5 = _mm256_unpackhi_epi64(b4, b5); + int32x8 c6 = _mm256_unpacklo_epi64(b6, b7); + int32x8 c7 = _mm256_unpackhi_epi64(b6, b7); + int32x8_MINMAX(c0, c1); + int32x8_MINMAX(c2, c3); + int32x8_MINMAX(c4, c5); + int32x8_MINMAX(c6, c7); + int32x8 d0 = _mm256_unpacklo_epi32(c0, c1); + int32x8 d1 = _mm256_unpackhi_epi32(c0, c1); + int32x8 d2 = _mm256_unpacklo_epi32(c2, c3); + int32x8 d3 = _mm256_unpackhi_epi32(c2, c3); + int32x8 d4 = _mm256_unpacklo_epi32(c4, c5); + int32x8 d5 = _mm256_unpackhi_epi32(c4, c5); + int32x8 d6 = _mm256_unpacklo_epi32(c6, c7); + int32x8 d7 = _mm256_unpackhi_epi32(c6, c7); + int32x8 e0 = _mm256_unpacklo_epi64(d0, d1); + int32x8 e1 = _mm256_unpackhi_epi64(d0, d1); + int32x8 e2 = _mm256_unpacklo_epi64(d2, d3); + int32x8 e3 = _mm256_unpackhi_epi64(d2, d3); + int32x8 e4 = _mm256_unpacklo_epi64(d4, d5); + int32x8 e5 = _mm256_unpackhi_epi64(d4, d5); + int32x8 e6 = _mm256_unpacklo_epi64(d6, d7); + int32x8 e7 = _mm256_unpackhi_epi64(d6, d7); + int32x8_MINMAX(e0, e1); + int32x8_MINMAX(e2, e3); + int32x8_MINMAX(e4, e5); + int32x8_MINMAX(e6, e7); + int32x8 f0 = _mm256_unpacklo_epi32(e0, e1); + int32x8 f1 = _mm256_unpackhi_epi32(e0, e1); + int32x8 f2 = _mm256_unpacklo_epi32(e2, e3); + int32x8 f3 = _mm256_unpackhi_epi32(e2, e3); + int32x8 f4 = _mm256_unpacklo_epi32(e4, e5); + int32x8 f5 = _mm256_unpackhi_epi32(e4, e5); + int32x8 f6 = _mm256_unpacklo_epi32(e6, e7); + int32x8 f7 = _mm256_unpackhi_epi32(e6, e7); + int32x8_store(&x[j], f0); + int32x8_store(&x[j + 8], f1); + int32x8_store(&x[j + 16], f2); + int32x8_store(&x[j + 24], f3); + int32x8_store(&x[j + 
32], f4); + int32x8_store(&x[j + 40], f5); + int32x8_store(&x[j + 48], f6); + int32x8_store(&x[j + 56], f7); + } + minmax_vector(x + j, x + j + 32, n - 32 - j); + goto continue16; + } + if (q == 16) { + j = 0; +continue16: + for (; j + 32 <= n; j += 32) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 8]); + int32x8 x2 = int32x8_load(&x[j + 16]); + int32x8 x3 = int32x8_load(&x[j + 24]); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8 a0 = _mm256_permute2x128_si256(x0, x1, 0x20); + int32x8 a1 = _mm256_permute2x128_si256(x0, x1, 0x31); + int32x8 a2 = _mm256_permute2x128_si256(x2, x3, 0x20); + int32x8 a3 = _mm256_permute2x128_si256(x2, x3, 0x31); + int32x8_MINMAX(a0, a1); + int32x8_MINMAX(a2, a3); + int32x8 b0 = _mm256_permute2x128_si256(a0, a1, 0x20); + int32x8 b1 = _mm256_permute2x128_si256(a0, a1, 0x31); + int32x8 b2 = _mm256_permute2x128_si256(a2, a3, 0x20); + int32x8 b3 = _mm256_permute2x128_si256(a2, a3, 0x31); + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); + int32x8 c2 = _mm256_unpacklo_epi64(b2, b3); + int32x8 c3 = _mm256_unpackhi_epi64(b2, b3); + int32x8_MINMAX(c0, c1); + int32x8_MINMAX(c2, c3); + int32x8 d0 = _mm256_unpacklo_epi32(c0, c1); + int32x8 d1 = _mm256_unpackhi_epi32(c0, c1); + int32x8 d2 = _mm256_unpacklo_epi32(c2, c3); + int32x8 d3 = _mm256_unpackhi_epi32(c2, c3); + int32x8 e0 = _mm256_unpacklo_epi64(d0, d1); + int32x8 e1 = _mm256_unpackhi_epi64(d0, d1); + int32x8 e2 = _mm256_unpacklo_epi64(d2, d3); + int32x8 e3 = _mm256_unpackhi_epi64(d2, d3); + int32x8_MINMAX(e0, e1); + int32x8_MINMAX(e2, e3); + int32x8 f0 = _mm256_unpacklo_epi32(e0, e1); + int32x8 f1 = _mm256_unpackhi_epi32(e0, e1); + int32x8 f2 = _mm256_unpacklo_epi32(e2, e3); + int32x8 f3 = _mm256_unpackhi_epi32(e2, e3); + int32x8_store(&x[j], f0); + int32x8_store(&x[j + 8], f1); + int32x8_store(&x[j + 16], f2); + int32x8_store(&x[j + 24], f3); + } + minmax_vector(x + j, x + j + 16, n - 16 - j); + goto continue8; + } + /* q == 8 */ + j = 0; +continue8: + for (; j + 16 <= n; j += 16) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 8]); + int32x8_MINMAX(x0, x1); + int32x8_store(&x[j], x0); + int32x8_store(&x[j + 8], x1); + int32x8 a0 = _mm256_permute2x128_si256(x0, x1, 0x20); /* x0123y0123 */ + int32x8 a1 = _mm256_permute2x128_si256(x0, x1, 0x31); /* x4567y4567 */ + int32x8_MINMAX(a0, a1); + int32x8 b0 = _mm256_permute2x128_si256(a0, a1, 0x20); /* x01234567 */ + int32x8 b1 = _mm256_permute2x128_si256(a0, a1, 0x31); /* y01234567 */ + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); /* x01y01x45y45 */ + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); /* x23y23x67y67 */ + int32x8_MINMAX(c0, c1); + int32x8 d0 = _mm256_unpacklo_epi32(c0, c1); /* x02x13x46x57 */ + int32x8 d1 = _mm256_unpackhi_epi32(c0, c1); /* y02y13y46y57 */ + int32x8 e0 = _mm256_unpacklo_epi64(d0, d1); /* x02y02x46y46 */ + int32x8 e1 = _mm256_unpackhi_epi64(d0, d1); /* x13y13x57y57 */ + int32x8_MINMAX(e0, e1); + int32x8 f0 = _mm256_unpacklo_epi32(e0, e1); /* x01234567 */ + int32x8 f1 = _mm256_unpackhi_epi32(e0, e1); /* y01234567 */ + int32x8_store(&x[j], f0); + int32x8_store(&x[j + 8], f1); + } + minmax_vector(x + j, x + j + 8, n - 8 - j); + if (j + 8 <= n) { + int32_MINMAX(&x[j], &x[j + 4]); + int32_MINMAX(&x[j + 1], &x[j + 5]); + int32_MINMAX(&x[j + 2], &x[j + 6]); + int32_MINMAX(&x[j + 3], &x[j + 7]); + int32_MINMAX(&x[j], &x[j + 2]); + int32_MINMAX(&x[j + 1], &x[j + 3]); + int32_MINMAX(&x[j], &x[j + 1]); 
+ int32_MINMAX(&x[j + 2], &x[j + 3]); + int32_MINMAX(&x[j + 4], &x[j + 6]); + int32_MINMAX(&x[j + 5], &x[j + 7]); + int32_MINMAX(&x[j + 4], &x[j + 5]); + int32_MINMAX(&x[j + 6], &x[j + 7]); + j += 8; + } + minmax_vector(x + j, x + j + 4, n - 4 - j); + if (j + 4 <= n) { + int32_MINMAX(&x[j], &x[j + 2]); + int32_MINMAX(&x[j + 1], &x[j + 3]); + int32_MINMAX(&x[j], &x[j + 1]); + int32_MINMAX(&x[j + 2], &x[j + 3]); + j += 4; + } + if (j + 3 <= n) { + int32_MINMAX(&x[j], &x[j + 2]); + } + if (j + 2 <= n) { + int32_MINMAX(&x[j], &x[j + 1]); + } +} diff --git a/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/int32_sort.h b/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/int32_sort.h new file mode 100644 index 0000000000..ec7a3c0b6a --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/int32_sort.h @@ -0,0 +1,9 @@ +#ifndef PQCLEAN_MCELIECE460896F_AVX_INT32_SORT_H +#define PQCLEAN_MCELIECE460896F_AVX_INT32_SORT_H + +#include +#include + +void PQCLEAN_MCELIECE460896F_AVX_int32_sort(int32_t *x, size_t n); + +#endif diff --git a/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/operations.c b/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/operations.c new file mode 100644 index 0000000000..703e520600 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/operations.c @@ -0,0 +1,136 @@ +#include "api.h" + +#include "aes256ctr.h" +#include "controlbits.h" +#include "crypto_hash.h" +#include "decrypt.h" +#include "encrypt.h" +#include "params.h" +#include "pk_gen.h" +#include "randombytes.h" +#include "sk_gen.h" +#include "util.h" + +#include +#include + +int PQCLEAN_MCELIECE460896F_AVX_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const uint8_t *pk +) { + uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; + uint8_t *e = two_e + 1; + uint8_t one_ec[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ] = {1}; + + PQCLEAN_MCELIECE460896F_AVX_encrypt(c, e, pk); + + crypto_hash_32b(c + SYND_BYTES, two_e, sizeof(two_e)); + + memcpy(one_ec + 1, e, SYS_N / 8); + memcpy(one_ec + 1 + SYS_N / 8, c, SYND_BYTES + 32); + + crypto_hash_32b(key, one_ec, sizeof(one_ec)); + + return 0; +} + +int PQCLEAN_MCELIECE460896F_AVX_crypto_kem_dec( + uint8_t *key, + const uint8_t *c, + const uint8_t *sk +) { + int i; + + uint8_t ret_confirm = 0; + uint8_t ret_decrypt = 0; + + uint16_t m; + + uint8_t conf[32]; + uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; + uint8_t *e = two_e + 1; + uint8_t preimage[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ]; + uint8_t *x = preimage; + + // + + ret_decrypt = (uint8_t)PQCLEAN_MCELIECE460896F_AVX_decrypt(e, sk + SYS_N / 8, c); + + crypto_hash_32b(conf, two_e, sizeof(two_e)); + + for (i = 0; i < 32; i++) { + ret_confirm |= conf[i] ^ c[SYND_BYTES + i]; + } + + m = ret_decrypt | ret_confirm; + m -= 1; + m >>= 8; + + *x++ = (~m & 0) | (m & 1); + for (i = 0; i < SYS_N / 8; i++) { + *x++ = (~m & sk[i]) | (m & e[i]); + } + for (i = 0; i < SYND_BYTES + 32; i++) { + *x++ = c[i]; + } + + crypto_hash_32b(key, preimage, sizeof(preimage)); + + return 0; +} + +int PQCLEAN_MCELIECE460896F_AVX_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +) { + int i; + uint8_t seed[ 32 ]; + uint8_t r[ SYS_T * 2 + (1 << GFBITS)*sizeof(uint32_t) + SYS_N / 8 + 32 ]; + uint8_t nonce[ 16 ] = {0}; + uint8_t *rp; + + gf f[ SYS_T ]; // element in GF(2^mt) + gf irr[ SYS_T ]; // Goppa polynomial + uint32_t perm[ 1 << GFBITS ]; // random permutation + + randombytes(seed, sizeof(seed)); + + while (1) { + rp = r; + PQCLEAN_MCELIECE460896F_AVX_aes256ctr(r, sizeof(r), nonce, seed); + memcpy(seed, &r[ sizeof(r) - 32 ], 32); + 
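/*
 * Structure of this retry loop, as read from the calls below: each attempt
 * consumes the freshly expanded AES-256-CTR block r in order (the SYS_T field
 * elements f, then the (1 << GFBITS) permutation words, then the leading
 * SYS_N / 8 secret-key bytes), keeps the last 32 bytes of r as the seed for
 * the next attempt (the memcpy just above), and starts over whenever
 * Goppa-polynomial generation, the permutation check, or the public-key
 * computation reports failure.
 */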
+ for (i = 0; i < SYS_T; i++) { + f[i] = PQCLEAN_MCELIECE460896F_AVX_load2(rp + i * 2); + } + rp += sizeof(f); + if (PQCLEAN_MCELIECE460896F_AVX_genpoly_gen(irr, f)) { + continue; + } + + for (i = 0; i < (1 << GFBITS); i++) { + perm[i] = PQCLEAN_MCELIECE460896F_AVX_load4(rp + i * 4); + } + rp += sizeof(perm); + if (PQCLEAN_MCELIECE460896F_AVX_perm_check(perm)) { + continue; + } + + for (i = 0; i < SYS_T; i++) { + PQCLEAN_MCELIECE460896F_AVX_store2(sk + SYS_N / 8 + i * 2, irr[i]); + } + if (PQCLEAN_MCELIECE460896F_AVX_pk_gen(pk, perm, sk + SYS_N / 8)) { + continue; + } + + memcpy(sk, rp, SYS_N / 8); + PQCLEAN_MCELIECE460896F_AVX_controlbits(sk + SYS_N / 8 + IRR_BYTES, perm); + + break; + } + + return 0; +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/params.h b/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/params.h new file mode 100644 index 0000000000..d592319287 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/params.h @@ -0,0 +1,21 @@ +#ifndef PQCLEAN_MCELIECE460896F_AVX_PARAMS_H +#define PQCLEAN_MCELIECE460896F_AVX_PARAMS_H + +#define GFBITS 13 +#define SYS_N 4608 +#define SYS_T 96 + +#define COND_BYTES ((1 << (GFBITS-4))*(2*GFBITS - 1)) +#define IRR_BYTES (SYS_T * 2) + +#define PK_NROWS (SYS_T*GFBITS) +#define PK_NCOLS (SYS_N - PK_NROWS) +#define PK_ROW_BYTES ((PK_NCOLS + 7)/8) + +#define SK_BYTES (SYS_N/8 + IRR_BYTES + COND_BYTES) +#define SYND_BYTES ((PK_NROWS + 7)/8) + +#define GFMASK ((1 << GFBITS) - 1) + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/pk_gen.c b/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/pk_gen.c new file mode 100644 index 0000000000..dcb0007b1a --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/pk_gen.c @@ -0,0 +1,358 @@ +/* + This file is for public-key generation +*/ + +#include "pk_gen.h" + +#include "benes.h" +#include "controlbits.h" +#include "fft.h" +#include "params.h" +#include "util.h" + +#include + +#define min(a, b) (((a) < (b)) ? 
(a) : (b)) + +static void de_bitslicing(uint64_t *out, vec256 in[][GFBITS]) { + int i, j, r; + uint64_t u = 0; + + for (i = 0; i < (1 << GFBITS); i++) { + out[i] = 0 ; + } + + for (i = 0; i < 32; i++) { + for (j = GFBITS - 1; j >= 0; j--) { + u = PQCLEAN_MCELIECE460896F_AVX_vec256_extract(in[i][j], 0); + for (r = 0; r < 64; r++) { + out[i * 256 + 0 * 64 + r] <<= 1; + out[i * 256 + 0 * 64 + r] |= (u >> r) & 1; + } + u = PQCLEAN_MCELIECE460896F_AVX_vec256_extract(in[i][j], 1); + for (r = 0; r < 64; r++) { + out[i * 256 + 1 * 64 + r] <<= 1; + out[i * 256 + 1 * 64 + r] |= (u >> r) & 1; + } + u = PQCLEAN_MCELIECE460896F_AVX_vec256_extract(in[i][j], 2); + for (r = 0; r < 64; r++) { + out[i * 256 + 2 * 64 + r] <<= 1; + out[i * 256 + 2 * 64 + r] |= (u >> r) & 1; + } + u = PQCLEAN_MCELIECE460896F_AVX_vec256_extract(in[i][j], 3); + for (r = 0; r < 64; r++) { + out[i * 256 + 3 * 64 + r] <<= 1; + out[i * 256 + 3 * 64 + r] |= (u >> r) & 1; + } + } + } +} + +static void to_bitslicing_2x(vec256 out0[][GFBITS], vec256 out1[][GFBITS], const uint64_t *in) { + int i, j, k, r; + uint64_t u[4] = {0}; + + for (i = 0; i < 32; i++) { + for (j = GFBITS - 1; j >= 0; j--) { + for (k = 0; k < 4; k++) { + for (r = 63; r >= 0; r--) { + u[k] <<= 1; + u[k] |= (in[i * 256 + k * 64 + r] >> (j + GFBITS)) & 1; + } + } + + out1[i][j] = PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(u[0], u[1], u[2], u[3]); + } + + for (j = GFBITS - 1; j >= 0; j--) { + for (k = 0; k < 4; k++) { + for (r = 63; r >= 0; r--) { + u[k] <<= 1; + u[k] |= (in[i * 256 + k * 64 + r] >> j) & 1; + } + } + + out0[i][GFBITS - 1 - j] = PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(u[0], u[1], u[2], u[3]); + } + } +} + +static void transpose_64x64(uint64_t *out, const uint64_t *in) { + int i, j, s, d; + + uint64_t x, y; + uint64_t masks[6][2] = { + {0x5555555555555555, 0xAAAAAAAAAAAAAAAA}, + {0x3333333333333333, 0xCCCCCCCCCCCCCCCC}, + {0x0F0F0F0F0F0F0F0F, 0xF0F0F0F0F0F0F0F0}, + {0x00FF00FF00FF00FF, 0xFF00FF00FF00FF00}, + {0x0000FFFF0000FFFF, 0xFFFF0000FFFF0000}, + {0x00000000FFFFFFFF, 0xFFFFFFFF00000000} + }; + + for (i = 0; i < 64; i++) { + out[i] = in[i]; + } + + for (d = 5; d >= 0; d--) { + s = 1 << d; + + for (i = 0; i < 64; i += s * 2) { + for (j = i; j < i + s; j++) { + x = (out[j] & masks[d][0]) | ((out[j + s] & masks[d][0]) << s); + y = ((out[j] & masks[d][1]) >> s) | (out[j + s] & masks[d][1]); + + out[j + 0] = x; + out[j + s] = y; + } + } + } +} + +/* return number of trailing zeros of the non-zero input in */ +static inline int ctz(uint64_t in) { + return (int)_tzcnt_u64(in); +} + +static inline uint64_t same_mask(uint16_t x, uint16_t y) { + uint64_t mask; + + mask = x ^ y; + mask -= 1; + mask >>= 63; + mask = -mask; + + return mask; +} + +static int mov_columns(uint64_t mat[][ ((SYS_N + 255) / 256) * 4 ], uint32_t *perm) { + int i, j, k, s, block_idx, row; + uint64_t buf[64], ctz_list[32], t, d, mask; + + row = GFBITS * SYS_T - 32; + block_idx = row / 64; + + // extract the 32x64 matrix + + for (i = 0; i < 32; i++) { + buf[i] = mat[ row + i ][ block_idx ]; + } + + // compute the column indices of pivots by Gaussian elimination. 
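// (this column-moving step appears to be what lets the 'f' parameter sets
//  accept semi-systematic matrices: the 32x64 block at the bottom of the
//  would-be identity part is made invertible by swapping columns, every swap
//  is mirrored in perm, and all updates use constant-time masks rather than
//  secret-dependent branches.  The pivot column of each of the 32 rows is
//  located first;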
+ // the indices are stored in ctz_list + + for (i = 0; i < 32; i++) { + t = buf[i]; + for (j = i + 1; j < 32; j++) { + t |= buf[j]; + } + + if (t == 0) { + return -1; // return if buf is not full rank + } + + ctz_list[i] = s = ctz(t); + + for (j = i + 1; j < 32; j++) { + mask = (buf[i] >> s) & 1; + mask -= 1; + buf[i] ^= buf[j] & mask; + } + for (j = 0; j < i; j++) { + mask = (buf[j] >> s) & 1; + mask = -mask; + buf[j] ^= buf[i] & mask; + } + for (j = i + 1; j < 32; j++) { + mask = (buf[j] >> s) & 1; + mask = -mask; + buf[j] ^= buf[i] & mask; + } + } + + // updating permutation + + for (j = 0; j < 32; j++) { + for (k = j + 1; k < 64; k++) { + d = perm[ row + j ] ^ perm[ row + k ]; + d &= same_mask(k, ctz_list[j]); + perm[ row + j ] ^= d; + perm[ row + k ] ^= d; + } + } + + // moving columns of mat according to the column indices of pivots + + for (i = 0; i < GFBITS * SYS_T; i += 64) { + + for (j = 0; j < min(64, GFBITS * SYS_T - i); j++) { + buf[j] = mat[ i + j ][ block_idx ]; + } + + transpose_64x64(buf, buf); + + for (j = 0; j < 32; j++) { + for (k = j + 1; k < 64; k++) { + d = buf[ j ] ^ buf[ k ]; + d &= same_mask(k, ctz_list[j]); + buf[ j ] ^= d; + buf[ k ] ^= d; + } + } + + transpose_64x64(buf, buf); + + for (j = 0; j < min(64, GFBITS * SYS_T - i); j++) { + mat[ i + j ][ block_idx ] = buf[j]; + } + } + + return 0; +} + +#define NBLOCKS1_H ((SYS_N + 63) / 64) +#define NBLOCKS2_H ((SYS_N + 255) / 256) +#define NBLOCKS1_I ((GFBITS * SYS_T + 63) / 64) +#define NBLOCKS2_I ((GFBITS * SYS_T + 255) / 256) +int PQCLEAN_MCELIECE460896F_AVX_pk_gen(unsigned char *pk, uint32_t *perm, const unsigned char *sk) { + int tail = (GFBITS * SYS_T) % 64; + + int i, j, k; + int row, c; + + uint64_t mat[ GFBITS * SYS_T ][ NBLOCKS2_H * 4 ]; + + uint64_t mask; + + vec128 sk_int[ GFBITS ]; + + vec256 consts[ 32 ][ GFBITS ]; + vec256 eval[ 32 ][ GFBITS ]; + vec256 prod[ 32 ][ GFBITS ]; + vec256 tmp[ GFBITS ]; + + uint64_t list[1 << GFBITS]; + + // compute the inverses + + PQCLEAN_MCELIECE460896F_AVX_irr_load(sk_int, sk); + + PQCLEAN_MCELIECE460896F_AVX_fft(eval, sk_int); + + PQCLEAN_MCELIECE460896F_AVX_vec256_copy(prod[0], eval[0]); + + for (i = 1; i < 32; i++) { + PQCLEAN_MCELIECE460896F_AVX_vec256_mul(prod[i], prod[i - 1], eval[i]); + } + + PQCLEAN_MCELIECE460896F_AVX_vec256_inv(tmp, prod[31]); + + for (i = 30; i >= 0; i--) { + PQCLEAN_MCELIECE460896F_AVX_vec256_mul(prod[i + 1], prod[i], tmp); + PQCLEAN_MCELIECE460896F_AVX_vec256_mul(tmp, tmp, eval[i + 1]); + } + + PQCLEAN_MCELIECE460896F_AVX_vec256_copy(prod[0], tmp); + + // fill matrix + + de_bitslicing(list, prod); + + for (i = 0; i < (1 << GFBITS); i++) { + list[i] <<= GFBITS; + list[i] |= i; + list[i] |= ((uint64_t) perm[i]) << 31; + } + + PQCLEAN_MCELIECE460896F_AVX_sort_63b(1 << GFBITS, list); + + to_bitslicing_2x(consts, prod, list); + + for (i = 0; i < (1 << GFBITS); i++) { + perm[i] = list[i] & GFMASK; + } + + for (j = 0; j < NBLOCKS2_H; j++) { + for (k = 0; k < GFBITS; k++) { + mat[ k ][ 4 * j + 0 ] = PQCLEAN_MCELIECE460896F_AVX_vec256_extract(prod[ j ][ k ], 0); + mat[ k ][ 4 * j + 1 ] = PQCLEAN_MCELIECE460896F_AVX_vec256_extract(prod[ j ][ k ], 1); + mat[ k ][ 4 * j + 2 ] = PQCLEAN_MCELIECE460896F_AVX_vec256_extract(prod[ j ][ k ], 2); + mat[ k ][ 4 * j + 3 ] = PQCLEAN_MCELIECE460896F_AVX_vec256_extract(prod[ j ][ k ], 3); + } + } + + for (i = 1; i < SYS_T; i++) { + for (j = 0; j < NBLOCKS2_H; j++) { + PQCLEAN_MCELIECE460896F_AVX_vec256_mul(prod[j], prod[j], consts[j]); + + for (k = 0; k < GFBITS; k++) { + mat[ i * GFBITS + k ][ 4 * j + 0 ] = 
PQCLEAN_MCELIECE460896F_AVX_vec256_extract(prod[ j ][ k ], 0);
+                mat[ i * GFBITS + k ][ 4 * j + 1 ] = PQCLEAN_MCELIECE460896F_AVX_vec256_extract(prod[ j ][ k ], 1);
+                mat[ i * GFBITS + k ][ 4 * j + 2 ] = PQCLEAN_MCELIECE460896F_AVX_vec256_extract(prod[ j ][ k ], 2);
+                mat[ i * GFBITS + k ][ 4 * j + 3 ] = PQCLEAN_MCELIECE460896F_AVX_vec256_extract(prod[ j ][ k ], 3);
+            }
+        }
+    }
+
+    // gaussian elimination
+
+    for (row = 0; row < PK_NROWS; row++) {
+        i = row >> 6;
+        j = row & 63;
+
+        if (row == GFBITS * SYS_T - 32) {
+            if (mov_columns(mat, perm)) {
+                return -1;
+            }
+        }
+
+        for (k = row + 1; k < PK_NROWS; k++) {
+            mask = mat[ row ][ i ] >> j;
+            mask &= 1;
+            mask -= 1;
+
+            for (c = 0; c < NBLOCKS1_H; c++) {
+                mat[ row ][ c ] ^= mat[ k ][ c ] & mask;
+            }
+        }
+
+        if ( ((mat[ row ][ i ] >> j) & 1) == 0 ) { // return if not systematic
+            return -1;
+        }
+
+        for (k = 0; k < row; k++) {
+            mask = mat[ k ][ i ] >> j;
+            mask &= 1;
+            mask = -mask;
+
+            for (c = 0; c < NBLOCKS1_H; c++) {
+                mat[ k ][ c ] ^= mat[ row ][ c ] & mask;
+            }
+        }
+
+        for (k = row + 1; k < PK_NROWS; k++) {
+            mask = mat[ k ][ i ] >> j;
+            mask &= 1;
+            mask = -mask;
+
+            for (c = 0; c < NBLOCKS1_H; c++) {
+                mat[ k ][ c ] ^= mat[ row ][ c ] & mask;
+            }
+        }
+    }
+
+    for (i = 0; i < GFBITS * SYS_T; i++) {
+        PQCLEAN_MCELIECE460896F_AVX_store_i(pk, mat[i][ NBLOCKS1_I - 1 ] >> tail, (64 - tail) / 8);
+        pk += (64 - tail) / 8;
+
+        for (j = NBLOCKS1_I; j < NBLOCKS1_H; j++) {
+            PQCLEAN_MCELIECE460896F_AVX_store8(pk, mat[i][j]);
+            pk += 8;
+        }
+    }
+
+    //
+
+    return 0;
+}
+
diff --git a/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/pk_gen.h b/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/pk_gen.h
new file mode 100644
index 0000000000..718b45ec85
--- /dev/null
+++ b/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/pk_gen.h
@@ -0,0 +1,12 @@
+#ifndef PQCLEAN_MCELIECE460896F_AVX_PK_GEN_H
+#define PQCLEAN_MCELIECE460896F_AVX_PK_GEN_H
+/*
+  This file is for public-key generation
+*/
+
+#include <stdint.h>
+
+int PQCLEAN_MCELIECE460896F_AVX_pk_gen(unsigned char * /*pk*/, uint32_t * /*perm*/, const unsigned char * /*sk*/);
+
+#endif
+
diff --git a/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/scalars_2x.inc b/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/scalars_2x.inc
new file mode 100644
index 0000000000..1783f9dbce
--- /dev/null
+++ b/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/scalars_2x.inc
@@ -0,0 +1,75 @@
+{
+    PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0X3C3CF30C0000C003, 0X0C0F0FCF0F0CF330),
+    PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0X0CCCC3F333C0000C, 0XF0000FC33C3CCF3C),
+    PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0X03C33F33FCC0C03C, 0X3C0F3F00C3C300FC),
+    PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0X0003000F3C03C0C0, 0X3C33CCC0F0F3CC30),
+    PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0XF33FF33030CF03F0, 0XC0CFFFFFCCCC30CC),
+    PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0X0CF0303300F0CCC0, 0X3FC3F3CCFFFC033F),
+    PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0XFF3F0C0CC0FF3CC0, 0XFC3030CCCCC0CFCF),
+    PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0XCF3CF0FF003FC000, 0X0FCF0C00CCF333C3),
+    PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0XC00FF3CF0303F300, 0XCFFCF33000CFF030),
+    PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0X3CCC0CC00CF0CC00, 0X00CFFCC330F30FCC),
+    PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0XF30FFC3C3FCCFC00, 0X3CCC3FCCC0F3FFF3),
+    PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0X3F0FC3F0CCF0C000, 0XF00F0C3FC003C0FF),
+    PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0X3000FF33CCF0F000, 0X330CCFCC03C0FC33)
+},
+{
+
PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0X0F0F0FF0F000000F, 0XF0FFFFFFF0F00F00), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0X00FFFFFFFF0000F0, 0X00FFF0FFFF0000FF), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0XFFFF00FF00000F00, 0X00FF00000F0F0FFF), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0XFFF000F00F0FF000, 0XF000F0000F00FF0F), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0XFFF0000F0FF000F0, 0XFF000000FFF00000), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0X00FF000FFF000000, 0XF0FF000FF00F0FF0), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0XFF0F0FFF0F0FF000, 0X0F0F0F00FF000F0F), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0X0FFF0000000F0000, 0X0F0F00F0F0F0F000), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0X00F000F0FFF00F00, 0X00F00F00F00F000F), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0X00F00FF00F00F000, 0X00F0F0F00000FFF0), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0XFFF000F000F00000, 0XFFFFFF0FF00F0FFF), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0X00F00F000FF00000, 0X0F0FFFF00FFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0X0000FF0F0000F000, 0XFFFF0F0FFF0FFF00) +}, +{ + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0X00FF0000000000FF, 0X00FF00FF00FF0000), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0XFFFFFFFFFF00FF00, 0XFF00FFFF000000FF), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0XFF0000FF00FF0000, 0X0000FFFF000000FF), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0XFFFF000000FF0000, 0X00FFFF00FF000000), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0XFF00000000FF0000, 0XFFFFFF0000FF00FF), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0X00FFFFFFFF000000, 0X0000FFFF00FFFF00), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0XFF0000FFFFFF0000, 0XFF00FF0000FFFF00), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0XFF00FF00FFFF0000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0X00FFFFFFFF00FF00, 0X0000FF0000000000), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0XFFFF000000000000, 0XFF00FFFF00FFFF00), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0X00FF0000FF000000, 0X00FFFF00000000FF), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0XFF00FF00FF000000, 0X0000FF00FF00FFFF), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0X00FF00FFFF000000, 0XFF0000FFFFFF0000) +}, +{ + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0X000000000000FFFF, 0X0000FFFF00000000), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0XFFFFFFFFFFFF0000, 0XFFFFFFFF0000FFFF), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0X0000000000000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0XFFFF0000FFFF0000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0XFFFFFFFFFFFF0000, 0X0000FFFF00000000), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0X0000FFFF00000000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0X0000FFFF00000000, 0XFFFFFFFF0000FFFF), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0XFFFF000000000000, 0X00000000FFFF0000), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0XFFFF000000000000, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0XFFFF000000000000, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000) +}, +{ + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0X00000000FFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000), + 
PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0X0000000000000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000) +} diff --git a/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/scalars_4x.inc b/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/scalars_4x.inc new file mode 100644 index 0000000000..2614718b01 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/scalars_4x.inc @@ -0,0 +1,91 @@ +{ + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X3C3CF30C0000C003, 0X0C0F0FCF0F0CF330, 0XF0F30C33CF03F03F, 0X3F30CC0C000F3FCC), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0CCCC3F333C0000C, 0XF0000FC33C3CCF3C, 0X00F30FC00C3300FF, 0XFC3CF030FC3FFF03), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X03C33F33FCC0C03C, 0X3C0F3F00C3C300FC, 0XF3CC3CF3F3FCF33F, 0X33FFFCFF0CCF3CC3), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0003000F3C03C0C0, 0X3C33CCC0F0F3CC30, 0X3C0FC0FC303C3F3C, 0X003CFF33C3CC30CF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XF33FF33030CF03F0, 0XC0CFFFFFCCCC30CC, 0XFC30CF303F3FF00F, 0XCFF3CF33C00F3003), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0CF0303300F0CCC0, 0X3FC3F3CCFFFC033F, 0X33300C0CC3300CF3, 0X00F3CC0CF3003CCF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFF3F0C0CC0FF3CC0, 0XFC3030CCCCC0CFCF, 0X3C030CF3F03FF3F3, 0X3C000CFCCC3C3333), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XCF3CF0FF003FC000, 0X0FCF0C00CCF333C3, 0X3CCC03FCCC3FFC03, 0XF3CF03C0FCF03FF0), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XC00FF3CF0303F300, 0XCFFCF33000CFF030, 0X033C3C3CF0003FC3, 0X3F3C3CF0C330330C), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X3CCC0CC00CF0CC00, 0X00CFFCC330F30FCC, 0XFFC0FF00F0FF0F03, 0X33CCFCC0FF0033F0), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XF30FFC3C3FCCFC00, 0X3CCC3FCCC0F3FFF3, 0XF3F30CF003FCC303, 0X33C300C0F0C003F3), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X3F0FC3F0CCF0C000, 0XF00F0C3FC003C0FF, 0X30CFCFC3CC0F3000, 0X003FF0003F00C00C), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X3000FF33CCF0F000, 0X330CCFCC03C0FC33, 0X0CF30CCF3FCFCC0F, 0XCFF3C3033F030FFF) +}, +{ + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0F0F0FF0F000000F, 0XF0FFFFFFF0F00F00, 0X0F0F00FF0FF0FFFF, 0XFF0F0F00F000F0FF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X00FFFFFFFF0000F0, 0X00FFF0FFFF0000FF, 0XF000F0F00F00FF0F, 0X0FFFFFFFFF00000F), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFF00FF00000F00, 0X00FF00000F0F0FFF, 0X000FFFF0FFF0FF0F, 0XF0FFFF000F00F0FF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFF000F00F0FF000, 0XF000F0000F00FF0F, 0X00F00FFF00000FF0, 0X0F0000F00FFF0FFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFF0000F0FF000F0, 0XFF000000FFF00000, 0XFFFFF0000FFFF00F, 0X0F0F0F00FF0F000F), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X00FF000FFF000000, 0XF0FF000FF00F0FF0, 0XFFF0FFF0000FFFF0, 0X000F0F0FFFF0F000), + 
PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFF0F0FFF0F0FF000, 0X0F0F0F00FF000F0F, 0XF0F0F0000F0F0F00, 0XF0FFFF0F00F0FF0F), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0FFF0000000F0000, 0X0F0F00F0F0F0F000, 0X00F000F0F00FFF00, 0X0F0F000F0F00F0FF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X00F000F0FFF00F00, 0X00F00F00F00F000F, 0XF0FF0F0FFF00F0FF, 0X0000F0FF00FF0F0F), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X00F00FF00F00F000, 0X00F0F0F00000FFF0, 0XF0FF0FFFF0F0F0FF, 0X00FFFF0FF0FFF0F0), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFF000F000F00000, 0XFFFFFF0FF00F0FFF, 0X00FFFFFFFFFFFFF0, 0X0000000F00F0FFF0), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X00F00F000FF00000, 0X0F0FFFF00FFFFFFF, 0X00FFF0F0FF000F0F, 0XF0F00000FF00F0F0), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000FF0F0000F000, 0XFFFF0F0FFF0FFF00, 0X000FFFF0000FFF00, 0X0F0F0FFFFFFFFFFF) +}, +{ + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X00FF0000000000FF, 0X00FF00FF00FF0000, 0XFFFF00FF00FF00FF, 0XFF0000FFFFFF00FF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFFFF00FF00, 0XFF00FFFF000000FF, 0X00FFFF000000FF00, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFF0000FF00FF0000, 0X0000FFFF000000FF, 0XFFFF00FFFFFFFF00, 0XFFFF000000FFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFF000000FF0000, 0X00FFFF00FF000000, 0X0000FFFF00FFFFFF, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFF00000000FF0000, 0XFFFFFF0000FF00FF, 0X00FF0000FF0000FF, 0XFFFFFF00FFFFFF00), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X00FFFFFFFF000000, 0X0000FFFF00FFFF00, 0XFFFF0000FF00FFFF, 0X00FFFF00FFFF00FF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFF0000FFFFFF0000, 0XFF00FF0000FFFF00, 0XFF000000FFFFFF00, 0X0000FFFF00FF0000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFF00FF00FFFF0000, 0X00000000FFFFFFFF, 0X000000000000FFFF, 0X000000FFFF000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X00FFFFFFFF00FF00, 0X0000FF0000000000, 0XFF00FF00FFFF0000, 0XFF00FF0000FF00FF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFF000000000000, 0XFF00FFFF00FFFF00, 0XFFFF00FFFF00FFFF, 0X00FF0000000000FF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X00FF0000FF000000, 0X00FFFF00000000FF, 0XFFFFFFFFFF00FF00, 0XFF00FFFF00FF00FF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFF00FF00FF000000, 0X0000FF00FF00FFFF, 0XFFFF00FFFF0000FF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X00FF00FFFF000000, 0XFF0000FFFFFF0000, 0X0000FF00000000FF, 0X0000FF000000FFFF) +}, +{ + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X000000000000FFFF, 0X0000FFFF00000000, 0XFFFFFFFFFFFFFFFF, 0X0000FFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFFFFFF0000, 0XFFFFFFFF0000FFFF, 0XFFFFFFFF00000000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X00000000FFFFFFFF, 0XFFFF000000000000, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000000000000000, 0X0000FFFF00000000, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFFFFFF0000, 0X0000FFFF00000000, 0X00000000FFFF0000, 0X00000000FFFF0000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000FFFF00000000, 0XFFFF0000FFFF0000, 0X0000FFFFFFFFFFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000, 0X0000FFFFFFFFFFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0XFFFFFFFF00000000, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000FFFF00000000, 
0XFFFFFFFF0000FFFF, 0X000000000000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFF000000000000, 0X00000000FFFF0000, 0X000000000000FFFF, 0X0000FFFF00000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFF000000000000, 0XFFFF0000FFFFFFFF, 0XFFFFFFFFFFFF0000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFF000000000000, 0XFFFF0000FFFFFFFF, 0XFFFFFFFF0000FFFF, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0X0000000000000000, 0XFFFF0000FFFFFFFF, 0X0000FFFFFFFFFFFF) +}, +{ + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X0000000000000000, 0X00000000FFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFF00000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0X0000000000000000, 0XFFFFFFFF00000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFF00000000, 0X00000000FFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFF00000000) +}, +{ + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + 
PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000),
+    PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF)
+}
+
+
diff --git a/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/sk_gen.c b/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/sk_gen.c
new file mode 100644
index 0000000000..ce68f366cb
--- /dev/null
+++ b/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/sk_gen.c
@@ -0,0 +1,98 @@
+/*
+  This file is for secret-key generation
+*/
+
+#include "sk_gen.h"
+
+#include "controlbits.h"
+#include "gf.h"
+#include "params.h"
+#include "util.h"
+
+/* input: f, element in GF((2^m)^t) */
+/* output: out, minimal polynomial of f */
+/* return: 0 for success and -1 for failure */
+int PQCLEAN_MCELIECE460896F_AVX_genpoly_gen(gf *out, gf *f) {
+    int i, j, k, c;
+
+    gf mat[ SYS_T + 1 ][ SYS_T ];
+    gf mask, inv, t;
+
+    // fill matrix
+
+    mat[0][0] = 1;
+
+    for (i = 1; i < SYS_T; i++) {
+        mat[0][i] = 0;
+    }
+
+    for (i = 0; i < SYS_T; i++) {
+        mat[1][i] = f[i];
+    }
+
+    for (j = 2; j <= SYS_T; j++) {
+        PQCLEAN_MCELIECE460896F_AVX_GF_mul(mat[j], mat[j - 1], f);
+    }
+
+    // gaussian
+
+    for (j = 0; j < SYS_T; j++) {
+        for (k = j + 1; k < SYS_T; k++) {
+            mask = PQCLEAN_MCELIECE460896F_AVX_gf_iszero(mat[ j ][ j ]);
+
+            for (c = j; c < SYS_T + 1; c++) {
+                mat[ c ][ j ] ^= mat[ c ][ k ] & mask;
+            }
+
+        }
+
+        if ( mat[ j ][ j ] == 0 ) { // return if not systematic
+            return -1;
+        }
+
+        inv = PQCLEAN_MCELIECE460896F_AVX_gf_inv(mat[j][j]);
+
+        for (c = j; c < SYS_T + 1; c++) {
+            mat[ c ][ j ] = PQCLEAN_MCELIECE460896F_AVX_gf_mul(mat[ c ][ j ], inv) ;
+        }
+
+        for (k = 0; k < SYS_T; k++) {
+            if (k != j) {
+                t = mat[ j ][ k ];
+
+                for (c = j; c < SYS_T + 1; c++) {
+                    mat[ c ][ k ] ^= PQCLEAN_MCELIECE460896F_AVX_gf_mul(mat[ c ][ j ], t);
+                }
+            }
+        }
+    }
+
+    for (i = 0; i < SYS_T; i++) {
+        out[i] = mat[ SYS_T ][ i ];
+    }
+
+    return 0;
+}
+
+/* input: permutation p represented as a list of 32-bit integers */
+/* output: -1 if some integer repeats in p */
+/*         0 otherwise */
+int PQCLEAN_MCELIECE460896F_AVX_perm_check(const uint32_t *p) {
+    int i;
+    uint64_t list[1 << GFBITS];
+
+    for (i = 0; i < (1 << GFBITS); i++) {
+        list[i] = p[i];
+    }
+
+    PQCLEAN_MCELIECE460896F_AVX_sort_63b(1 << GFBITS, list);
+
+    for (i = 1; i < (1 << GFBITS); i++) {
+        if (list[i - 1] == list[i]) {
+            return -1;
+        }
+    }
+
+    return 0;
+}
+
diff --git a/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/sk_gen.h b/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/sk_gen.h
new file mode 100644
index 0000000000..c80be9066e
--- /dev/null
+++ b/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/sk_gen.h
@@ -0,0 +1,16 @@
+#ifndef PQCLEAN_MCELIECE460896F_AVX_SK_GEN_H
+#define PQCLEAN_MCELIECE460896F_AVX_SK_GEN_H
+/*
+  This file is for secret-key generation
+*/
+
+
+#include "gf.h"
+
+#include <stdint.h>
+
+int PQCLEAN_MCELIECE460896F_AVX_genpoly_gen(gf * /*out*/, gf * /*f*/);
+int PQCLEAN_MCELIECE460896F_AVX_perm_check(const uint32_t * /*p*/);
+
+#endif
+
diff --git a/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/syndrome_asm.S b/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/syndrome_asm.S
new file mode 100644
index 0000000000..a65147ddce
--- /dev/null
+++ b/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/syndrome_asm.S
@@ -0,0 +1,650 @@
+
+# qhasm: int64 input_0
+
+# qhasm: int64 input_1
+
+# qhasm: int64 input_2
+
+# qhasm: int64 input_3
+
+# qhasm: int64 input_4
+
+# qhasm: int64 input_5
+
+#
qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: int64 b64 + +# qhasm: int64 synd + +# qhasm: int64 addr + +# qhasm: int64 c + +# qhasm: int64 c_all + +# qhasm: int64 row + +# qhasm: int64 p + +# qhasm: int64 e + +# qhasm: int64 s + +# qhasm: reg256 pp + +# qhasm: reg256 ee + +# qhasm: reg256 ss + +# qhasm: int64 buf_ptr + +# qhasm: stack256 buf + +# qhasm: enter syndrome_asm +.p2align 5 +.global _PQCLEAN_MCELIECE460896F_AVX_syndrome_asm +.global PQCLEAN_MCELIECE460896F_AVX_syndrome_asm +_PQCLEAN_MCELIECE460896F_AVX_syndrome_asm: +PQCLEAN_MCELIECE460896F_AVX_syndrome_asm: +mov %rsp,%r11 +and $31,%r11 +add $32,%r11 +sub %r11,%rsp + +# qhasm: input_1 += 523740 +# asm 1: add $523740,buf_ptr=int64#4 +# asm 2: leaq buf_ptr=%rcx +leaq 0(%rsp),%rcx + +# qhasm: row = 1248 +# asm 1: mov $1248,>row=int64#5 +# asm 2: mov $1248,>row=%r8 +mov $1248,%r8 + +# qhasm: loop: +._loop: + +# qhasm: row -= 1 +# asm 1: sub $1,ss=reg256#1 +# asm 2: vmovupd 0(ss=%ymm0 +vmovupd 0(%rsi),%ymm0 + +# qhasm: ee = mem256[ input_2 + 156 ] +# asm 1: vmovupd 156(ee=reg256#2 +# asm 2: vmovupd 156(ee=%ymm1 +vmovupd 156(%rdx),%ymm1 + +# qhasm: ss &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 32(pp=%ymm1 +vmovupd 32(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 188 ] +# asm 1: vmovupd 188(ee=reg256#3 +# asm 2: vmovupd 188(ee=%ymm2 +vmovupd 188(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 64(pp=%ymm1 +vmovupd 64(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 220 ] +# asm 1: vmovupd 220(ee=reg256#3 +# asm 2: vmovupd 220(ee=%ymm2 +vmovupd 220(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 96(pp=%ymm1 +vmovupd 96(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 252 ] +# asm 1: vmovupd 252(ee=reg256#3 +# asm 2: vmovupd 252(ee=%ymm2 +vmovupd 252(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 128(pp=%ymm1 +vmovupd 128(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 284 ] +# asm 1: vmovupd 284(ee=reg256#3 +# asm 2: vmovupd 284(ee=%ymm2 +vmovupd 284(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 160(pp=%ymm1 +vmovupd 160(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 316 ] +# asm 1: vmovupd 316(ee=reg256#3 +# asm 2: vmovupd 316(ee=%ymm2 +vmovupd 316(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 192(pp=%ymm1 +vmovupd 192(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 348 ] +# asm 1: vmovupd 348(ee=reg256#3 +# asm 2: vmovupd 348(ee=%ymm2 +vmovupd 348(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 224(pp=%ymm1 +vmovupd 224(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 380 ] +# asm 1: vmovupd 380(ee=reg256#3 +# asm 2: vmovupd 380(ee=%ymm2 +vmovupd 380(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 256(pp=%ymm1 +vmovupd 256(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 412 ] +# asm 1: vmovupd 412(ee=reg256#3 +# asm 2: vmovupd 412(ee=%ymm2 +vmovupd 412(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 288(pp=%ymm1 +vmovupd 288(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 444 ] +# asm 1: vmovupd 444(ee=reg256#3 +# asm 2: vmovupd 444(ee=%ymm2 +vmovupd 444(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 320(pp=%ymm1 +vmovupd 320(%rsi),%ymm1 + 
+# qhasm: ee = mem256[ input_2 + 476 ] +# asm 1: vmovupd 476(ee=reg256#3 +# asm 2: vmovupd 476(ee=%ymm2 +vmovupd 476(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 352(pp=%ymm1 +vmovupd 352(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 508 ] +# asm 1: vmovupd 508(ee=reg256#3 +# asm 2: vmovupd 508(ee=%ymm2 +vmovupd 508(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 384(pp=%ymm1 +vmovupd 384(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 540 ] +# asm 1: vmovupd 540(ee=reg256#3 +# asm 2: vmovupd 540(ee=%ymm2 +vmovupd 540(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand buf=stack256#1 +# asm 2: vmovapd buf=0(%rsp) +vmovapd %ymm0,0(%rsp) + +# qhasm: s = *(uint32 *)(input_1 + 416) +# asm 1: movl 416(s=int64#6d +# asm 2: movl 416(s=%r9d +movl 416(%rsi),%r9d + +# qhasm: e = *(uint32 *)(input_2 + 572) +# asm 1: movl 572(e=int64#7d +# asm 2: movl 572(e=%eax +movl 572(%rdx),%eax + +# qhasm: s &= e +# asm 1: and c_all=int64#6 +# asm 2: popcnt c_all=%r9 +popcnt %r9, %r9 + +# qhasm: b64 = mem64[ buf_ptr + 0 ] +# asm 1: movq 0(b64=int64#7 +# asm 2: movq 0(b64=%rax +movq 0(%rcx),%rax + +# qhasm: c = count(b64) +# asm 1: popcnt c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# qhasm: c_all ^= c +# asm 1: xor b64=int64#7 +# asm 2: movq 8(b64=%rax +movq 8(%rcx),%rax + +# qhasm: c = count(b64) +# asm 1: popcnt c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# qhasm: c_all ^= c +# asm 1: xor b64=int64#7 +# asm 2: movq 16(b64=%rax +movq 16(%rcx),%rax + +# qhasm: c = count(b64) +# asm 1: popcnt c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# qhasm: c_all ^= c +# asm 1: xor b64=int64#7 +# asm 2: movq 24(b64=%rax +movq 24(%rcx),%rax + +# qhasm: c = count(b64) +# asm 1: popcnt c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# qhasm: c_all ^= c +# asm 1: xor addr=int64#7 +# asm 2: mov addr=%rax +mov %r8,%rax + +# qhasm: (uint64) addr >>= 3 +# asm 1: shr $3,synd=int64#8 +# asm 2: movzbq 0(synd=%r10 +movzbq 0(%rax),%r10 + +# qhasm: synd <<= 1 +# asm 1: shl $1,ss=reg256#1 +# asm 2: vmovupd 0(ss=%ymm0 +vmovupd 0(%rdi),%ymm0 + +# qhasm: ee = mem256[ input_2 + 0 ] +# asm 1: vmovupd 0(ee=reg256#2 +# asm 2: vmovupd 0(ee=%ymm1 +vmovupd 0(%rdx),%ymm1 + +# qhasm: ss ^= ee +# asm 1: vpxor ss=reg256#1 +# asm 2: vmovupd 32(ss=%ymm0 +vmovupd 32(%rdi),%ymm0 + +# qhasm: ee = mem256[ input_2 + 32 ] +# asm 1: vmovupd 32(ee=reg256#2 +# asm 2: vmovupd 32(ee=%ymm1 +vmovupd 32(%rdx),%ymm1 + +# qhasm: ss ^= ee +# asm 1: vpxor ss=reg256#1 +# asm 2: vmovupd 64(ss=%ymm0 +vmovupd 64(%rdi),%ymm0 + +# qhasm: ee = mem256[ input_2 + 64 ] +# asm 1: vmovupd 64(ee=reg256#2 +# asm 2: vmovupd 64(ee=%ymm1 +vmovupd 64(%rdx),%ymm1 + +# qhasm: ss ^= ee +# asm 1: vpxor ss=reg256#1 +# asm 2: vmovupd 96(ss=%ymm0 +vmovupd 96(%rdi),%ymm0 + +# qhasm: ee = mem256[ input_2 + 96 ] +# asm 1: vmovupd 96(ee=reg256#2 +# asm 2: vmovupd 96(ee=%ymm1 +vmovupd 96(%rdx),%ymm1 + +# qhasm: ss ^= ee +# asm 1: vpxor s=int64#2 +# asm 2: movq 128(s=%rsi +movq 128(%rdi),%rsi + +# qhasm: e = mem64[ input_2 + 128 ] +# asm 1: movq 128(e=int64#4 +# asm 2: movq 128(e=%rcx +movq 128(%rdx),%rcx + +# qhasm: s ^= e +# asm 1: xor s=int64#2 +# asm 2: movq 136(s=%rsi +movq 136(%rdi),%rsi + +# qhasm: e = mem64[ input_2 + 136 ] +# asm 1: movq 136(e=int64#4 +# asm 2: movq 136(e=%rcx +movq 136(%rdx),%rcx + +# qhasm: s ^= e +# asm 1: xor s=int64#2 +# asm 2: movq 144(s=%rsi +movq 144(%rdi),%rsi + +# qhasm: e = mem64[ input_2 + 144 ] +# asm 1: movq 144(e=int64#4 +# asm 2: movq 144(e=%rcx +movq 
144(%rdx),%rcx + +# qhasm: s ^= e +# asm 1: xor s=int64#2d +# asm 2: movl 152(s=%esi +movl 152(%rdi),%esi + +# qhasm: e = *(uint32 *)( input_2 + 152 ) +# asm 1: movl 152(e=int64#3d +# asm 2: movl 152(e=%edx +movl 152(%rdx),%edx + +# qhasm: s ^= e +# asm 1: xor mask0=reg128#1 +# asm 2: movdqa PQCLEAN_MCELIECE460896F_AVX_MASK5_0(%rip),>mask0=%xmm0 +movdqa PQCLEAN_MCELIECE460896F_AVX_MASK5_0(%rip),%xmm0 + +# qhasm: mask1 aligned= mem128[ PQCLEAN_MCELIECE460896F_AVX_MASK5_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE460896F_AVX_MASK5_1(%rip),>mask1=reg128#2 +# asm 2: movdqa PQCLEAN_MCELIECE460896F_AVX_MASK5_1(%rip),>mask1=%xmm1 +movdqa PQCLEAN_MCELIECE460896F_AVX_MASK5_1(%rip),%xmm1 + +# qhasm: mask2 aligned= mem128[ PQCLEAN_MCELIECE460896F_AVX_MASK4_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE460896F_AVX_MASK4_0(%rip),>mask2=reg128#3 +# asm 2: movdqa PQCLEAN_MCELIECE460896F_AVX_MASK4_0(%rip),>mask2=%xmm2 +movdqa PQCLEAN_MCELIECE460896F_AVX_MASK4_0(%rip),%xmm2 + +# qhasm: mask3 aligned= mem128[ PQCLEAN_MCELIECE460896F_AVX_MASK4_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE460896F_AVX_MASK4_1(%rip),>mask3=reg128#4 +# asm 2: movdqa PQCLEAN_MCELIECE460896F_AVX_MASK4_1(%rip),>mask3=%xmm3 +movdqa PQCLEAN_MCELIECE460896F_AVX_MASK4_1(%rip),%xmm3 + +# qhasm: mask4 aligned= mem128[ PQCLEAN_MCELIECE460896F_AVX_MASK3_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE460896F_AVX_MASK3_0(%rip),>mask4=reg128#5 +# asm 2: movdqa PQCLEAN_MCELIECE460896F_AVX_MASK3_0(%rip),>mask4=%xmm4 +movdqa PQCLEAN_MCELIECE460896F_AVX_MASK3_0(%rip),%xmm4 + +# qhasm: mask5 aligned= mem128[ PQCLEAN_MCELIECE460896F_AVX_MASK3_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE460896F_AVX_MASK3_1(%rip),>mask5=reg128#6 +# asm 2: movdqa PQCLEAN_MCELIECE460896F_AVX_MASK3_1(%rip),>mask5=%xmm5 +movdqa PQCLEAN_MCELIECE460896F_AVX_MASK3_1(%rip),%xmm5 + +# qhasm: x0 = mem128[ input_0 + 0 ] +# asm 1: movdqu 0(x0=reg128#7 +# asm 2: movdqu 0(x0=%xmm6 +movdqu 0(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 128 ] +# asm 1: movdqu 128(x1=reg128#8 +# asm 2: movdqu 128(x1=%xmm7 +movdqu 128(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 256 ] +# asm 1: movdqu 256(x2=reg128#9 +# asm 2: movdqu 256(x2=%xmm8 +movdqu 256(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 384 ] +# asm 1: movdqu 384(x3=reg128#10 +# asm 2: movdqu 384(x3=%xmm9 +movdqu 384(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 512 ] +# asm 1: movdqu 512(x4=reg128#11 +# asm 2: movdqu 512(x4=%xmm10 +movdqu 512(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 640 ] +# asm 1: movdqu 640(x5=reg128#12 +# asm 2: movdqu 640(x5=%xmm11 +movdqu 640(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 768 ] +# asm 1: movdqu 768(x6=reg128#13 +# asm 2: movdqu 768(x6=%xmm12 +movdqu 768(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 896 ] +# asm 1: movdqu 896(x7=reg128#14 +# asm 2: movdqu 896(x7=%xmm13 +movdqu 896(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand 
v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: 
vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: 
vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 0 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 16(x0=%xmm6 +movdqu 16(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 144 ] +# asm 1: movdqu 144(x1=reg128#8 +# asm 2: movdqu 144(x1=%xmm7 +movdqu 144(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 272 ] +# asm 1: movdqu 272(x2=reg128#9 +# asm 2: movdqu 272(x2=%xmm8 +movdqu 272(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 400 ] +# asm 1: movdqu 400(x3=reg128#10 +# asm 2: movdqu 400(x3=%xmm9 +movdqu 400(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 528 ] +# asm 1: movdqu 528(x4=reg128#11 +# asm 2: movdqu 528(x4=%xmm10 +movdqu 528(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 656 ] +# asm 1: movdqu 656(x5=reg128#12 +# asm 2: movdqu 656(x5=%xmm11 +movdqu 656(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 784 ] +# asm 1: movdqu 784(x6=reg128#13 +# asm 2: movdqu 784(x6=%xmm12 +movdqu 784(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 912 ] +# asm 1: movdqu 912(x7=reg128#14 +# asm 2: movdqu 912(x7=%xmm13 +movdqu 912(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand 
%xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw 
$8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 16 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 32(x0=%xmm6 +movdqu 32(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 160 ] +# asm 1: movdqu 160(x1=reg128#8 +# asm 2: movdqu 160(x1=%xmm7 +movdqu 160(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 288 ] +# asm 1: movdqu 288(x2=reg128#9 +# asm 2: movdqu 288(x2=%xmm8 +movdqu 288(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 416 ] +# asm 1: movdqu 416(x3=reg128#10 +# asm 2: movdqu 416(x3=%xmm9 +movdqu 416(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 544 ] +# asm 1: movdqu 544(x4=reg128#11 +# asm 2: movdqu 544(x4=%xmm10 +movdqu 544(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 672 ] +# asm 1: movdqu 672(x5=reg128#12 +# asm 2: movdqu 672(x5=%xmm11 +movdqu 672(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 800 ] +# asm 1: movdqu 800(x6=reg128#13 +# asm 2: movdqu 800(x6=%xmm12 +movdqu 800(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 928 ] +# asm 1: movdqu 928(x7=reg128#14 +# asm 2: movdqu 928(x7=%xmm13 +movdqu 928(%rdi),%xmm13 + +# qhasm: v00 = x0 & 
mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 
[qhasm-generated 128-bit transpose assembly from the PQCLEAN_MCELIECE460896F_AVX implementation, vendored verbatim by this patch. The original line breaks of the diff were lost in extraction; the span consists of repeated blocks of the same pattern: eight 128-bit rows are loaded with movdqu from input_0 (%rdi) at increasing offsets, combined pairwise through mask/shift/or rounds (vpand against the PQCLEAN_MCELIECE460896F_AVX_MASK0_0 ... MASK2_1 constants, vpsllq/vpsrlq by 32, vpslld/vpsrld by 16, vpsllw/vpsrlw by 8, and psllq/psrlq by 4, 2 and 1, each followed by vpor), and stored back with movdqu.]
asm 2: movdqu 336(x5=%xmm11 +movdqu 336(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 352 ] +# asm 1: movdqu 352(x6=reg128#13 +# asm 2: movdqu 352(x6=%xmm12 +movdqu 352(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 368 ] +# asm 1: movdqu 368(x7=reg128#14 +# asm 2: movdqu 368(x7=%xmm13 +movdqu 368(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# 
qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# 
qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 256 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 384(x0=%xmm6 +movdqu 384(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 400 ] +# asm 1: movdqu 400(x1=reg128#8 +# asm 2: movdqu 400(x1=%xmm7 +movdqu 400(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 416 ] +# asm 1: movdqu 416(x2=reg128#9 +# asm 2: movdqu 416(x2=%xmm8 +movdqu 416(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 432 ] +# asm 1: movdqu 432(x3=reg128#10 +# asm 2: movdqu 432(x3=%xmm9 +movdqu 432(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 448 ] +# asm 1: movdqu 448(x4=reg128#11 +# asm 2: movdqu 448(x4=%xmm10 +movdqu 448(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 464 ] +# asm 1: movdqu 464(x5=reg128#12 +# asm 2: movdqu 464(x5=%xmm11 +movdqu 464(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 480 ] +# asm 1: movdqu 480(x6=reg128#13 +# asm 2: movdqu 480(x6=%xmm12 +movdqu 480(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 496 ] +# asm 1: movdqu 496(x7=reg128#14 +# asm 2: movdqu 496(x7=%xmm13 +movdqu 496(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor 
%xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand 
v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 384 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 512(x0=%xmm6 +movdqu 512(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 528 ] +# asm 1: movdqu 528(x1=reg128#8 +# asm 2: movdqu 528(x1=%xmm7 +movdqu 528(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 544 ] +# asm 1: movdqu 544(x2=reg128#9 +# asm 2: movdqu 544(x2=%xmm8 +movdqu 544(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 
+ 560 ] +# asm 1: movdqu 560(x3=reg128#10 +# asm 2: movdqu 560(x3=%xmm9 +movdqu 560(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 576 ] +# asm 1: movdqu 576(x4=reg128#11 +# asm 2: movdqu 576(x4=%xmm10 +movdqu 576(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 592 ] +# asm 1: movdqu 592(x5=reg128#12 +# asm 2: movdqu 592(x5=%xmm11 +movdqu 592(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 608 ] +# asm 1: movdqu 608(x6=reg128#13 +# asm 2: movdqu 608(x6=%xmm12 +movdqu 608(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 624 ] +# asm 1: movdqu 624(x7=reg128#14 +# asm 2: movdqu 624(x7=%xmm13 +movdqu 624(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq 
$2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# 
asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 512 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 640(x0=%xmm6 +movdqu 640(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 656 ] +# asm 1: movdqu 656(x1=reg128#8 +# asm 2: movdqu 656(x1=%xmm7 +movdqu 656(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 672 ] +# asm 1: movdqu 672(x2=reg128#9 +# asm 2: movdqu 672(x2=%xmm8 +movdqu 672(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 688 ] +# asm 1: movdqu 688(x3=reg128#10 +# asm 2: movdqu 688(x3=%xmm9 +movdqu 688(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 704 ] +# asm 1: movdqu 704(x4=reg128#11 +# asm 2: movdqu 704(x4=%xmm10 +movdqu 704(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 720 ] +# asm 1: movdqu 720(x5=reg128#12 +# asm 2: movdqu 720(x5=%xmm11 +movdqu 720(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 736 ] +# asm 1: movdqu 736(x6=reg128#13 +# asm 2: movdqu 736(x6=%xmm12 +movdqu 736(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 752 ] +# asm 1: movdqu 752(x7=reg128#14 +# asm 2: movdqu 752(x7=%xmm13 +movdqu 752(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# 
qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor 
%xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 640 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 768(x0=%xmm6 
+movdqu 768(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 784 ] +# asm 1: movdqu 784(x1=reg128#8 +# asm 2: movdqu 784(x1=%xmm7 +movdqu 784(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 800 ] +# asm 1: movdqu 800(x2=reg128#9 +# asm 2: movdqu 800(x2=%xmm8 +movdqu 800(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 816 ] +# asm 1: movdqu 816(x3=reg128#10 +# asm 2: movdqu 816(x3=%xmm9 +movdqu 816(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 832 ] +# asm 1: movdqu 832(x4=reg128#11 +# asm 2: movdqu 832(x4=%xmm10 +movdqu 832(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 848 ] +# asm 1: movdqu 848(x5=reg128#12 +# asm 2: movdqu 848(x5=%xmm11 +movdqu 848(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 864 ] +# asm 1: movdqu 864(x6=reg128#13 +# asm 2: movdqu 864(x6=%xmm12 +movdqu 864(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 880 ] +# asm 1: movdqu 880(x7=reg128#14 +# asm 2: movdqu 880(x7=%xmm13 +movdqu 880(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# 
asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# 
asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 768 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 896(x0=%xmm6 +movdqu 896(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 912 ] +# asm 1: movdqu 912(x1=reg128#8 +# asm 2: movdqu 912(x1=%xmm7 +movdqu 912(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 928 ] +# asm 1: movdqu 928(x2=reg128#9 +# asm 2: movdqu 928(x2=%xmm8 +movdqu 928(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 944 ] +# asm 1: movdqu 944(x3=reg128#10 +# asm 2: movdqu 944(x3=%xmm9 +movdqu 944(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 960 ] +# asm 1: movdqu 960(x4=reg128#11 +# asm 2: movdqu 960(x4=%xmm10 +movdqu 960(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 976 ] +# asm 1: movdqu 976(x5=reg128#12 +# asm 2: movdqu 976(x5=%xmm11 +movdqu 976(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 992 ] +# asm 1: movdqu 992(x6=reg128#13 +# asm 2: movdqu 992(x6=%xmm12 +movdqu 992(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 1008 ] +# asm 1: movdqu 1008(x7=reg128#14 +# asm 2: movdqu 1008(x7=%xmm13 +movdqu 1008(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & 
mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#1 +# asm 2: vpand v10=%xmm0 +vpand %xmm0,%xmm13,%xmm0 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm1,%xmm13,%xmm1 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm12,%xmm0 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm9,%xmm1 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm2,%xmm14,%xmm9 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#13 +# asm 2: vpand v10=%xmm12 +vpand %xmm2,%xmm11,%xmm12 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm3,%xmm14,%xmm13 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm12,%xmm9,%xmm9 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm13,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm10,%xmm12 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#14 +# asm 2: vpand v10=%xmm13 +vpand %xmm2,%xmm0,%xmm13 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm3,%xmm0,%xmm0 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#13 +# asm 2: vpor x1=%xmm12 +vpor %xmm13,%xmm12,%xmm12 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm10,%xmm0 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm2,%xmm6,%xmm10 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#14 +# asm 2: vpand v10=%xmm13 +vpand %xmm2,%xmm8,%xmm13 + +# qhasm: 2x v10 <<= 2 +# 
asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#11 +# asm 2: vpor x4=%xmm10 +vpor %xmm13,%xmm10,%xmm10 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#3 +# asm 2: vpand v10=%xmm2 +vpand %xmm2,%xmm1,%xmm2 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm3,%xmm1,%xmm1 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#3 +# asm 2: vpor x5=%xmm2 +vpor %xmm2,%xmm8,%xmm2 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm7,%xmm1 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#4 +# asm 2: vpand v00=%xmm3 +vpand %xmm4,%xmm9,%xmm3 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#8 +# asm 2: vpand v10=%xmm7 +vpand %xmm4,%xmm12,%xmm7 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm5,%xmm9,%xmm8 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm5,%xmm12,%xmm9 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#4 +# asm 2: vpor x0=%xmm3 +vpor %xmm7,%xmm3,%xmm3 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#8 +# asm 2: vpor x1=%xmm7 +vpor %xmm9,%xmm8,%xmm7 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm4,%xmm11,%xmm8 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#10 +# asm 2: vpand v10=%xmm9 +vpand %xmm4,%xmm0,%xmm9 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm5,%xmm0,%xmm0 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#9 +# asm 2: vpor x2=%xmm8 +vpor %xmm9,%xmm8,%xmm8 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm11,%xmm0 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm10,%xmm9 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#12 +# asm 2: vpand v10=%xmm11 +vpand %xmm4,%xmm2,%xmm11 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#3 +# asm 2: vpand v11=%xmm2 +vpand %xmm5,%xmm2,%xmm2 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#10 +# asm 2: vpor x4=%xmm9 +vpor %xmm11,%xmm9,%xmm9 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#3 +# asm 2: vpor x5=%xmm2 +vpor %xmm2,%xmm10,%xmm2 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm4,%xmm6,%xmm10 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#5 +# asm 2: vpand v10=%xmm4 +vpand %xmm4,%xmm1,%xmm4 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm5,%xmm1,%xmm1 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#5 +# asm 2: vpor x6=%xmm4 
+vpor %xmm4,%xmm10,%xmm4 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm6,%xmm1 + +# qhasm: mem128[ input_0 + 896 ] = x0 +# asm 1: movdqu mask0=reg256#1 +# asm 2: vmovapd PQCLEAN_MCELIECE460896F_AVX_MASK5_0,>mask0=%ymm0 +vmovapd PQCLEAN_MCELIECE460896F_AVX_MASK5_0(%rip),%ymm0 + +# qhasm: mask1 aligned= mem256[ PQCLEAN_MCELIECE460896F_AVX_MASK5_1 ] +# asm 1: vmovapd PQCLEAN_MCELIECE460896F_AVX_MASK5_1,>mask1=reg256#2 +# asm 2: vmovapd PQCLEAN_MCELIECE460896F_AVX_MASK5_1,>mask1=%ymm1 +vmovapd PQCLEAN_MCELIECE460896F_AVX_MASK5_1(%rip),%ymm1 + +# qhasm: mask2 aligned= mem256[ PQCLEAN_MCELIECE460896F_AVX_MASK4_0 ] +# asm 1: vmovapd PQCLEAN_MCELIECE460896F_AVX_MASK4_0,>mask2=reg256#3 +# asm 2: vmovapd PQCLEAN_MCELIECE460896F_AVX_MASK4_0,>mask2=%ymm2 +vmovapd PQCLEAN_MCELIECE460896F_AVX_MASK4_0(%rip),%ymm2 + +# qhasm: mask3 aligned= mem256[ PQCLEAN_MCELIECE460896F_AVX_MASK4_1 ] +# asm 1: vmovapd PQCLEAN_MCELIECE460896F_AVX_MASK4_1,>mask3=reg256#4 +# asm 2: vmovapd PQCLEAN_MCELIECE460896F_AVX_MASK4_1,>mask3=%ymm3 +vmovapd PQCLEAN_MCELIECE460896F_AVX_MASK4_1(%rip),%ymm3 + +# qhasm: mask4 aligned= mem256[ PQCLEAN_MCELIECE460896F_AVX_MASK3_0 ] +# asm 1: vmovapd PQCLEAN_MCELIECE460896F_AVX_MASK3_0,>mask4=reg256#5 +# asm 2: vmovapd PQCLEAN_MCELIECE460896F_AVX_MASK3_0,>mask4=%ymm4 +vmovapd PQCLEAN_MCELIECE460896F_AVX_MASK3_0(%rip),%ymm4 + +# qhasm: mask5 aligned= mem256[ PQCLEAN_MCELIECE460896F_AVX_MASK3_1 ] +# asm 1: vmovapd PQCLEAN_MCELIECE460896F_AVX_MASK3_1,>mask5=reg256#6 +# asm 2: vmovapd PQCLEAN_MCELIECE460896F_AVX_MASK3_1,>mask5=%ymm5 +vmovapd PQCLEAN_MCELIECE460896F_AVX_MASK3_1(%rip),%ymm5 + +# qhasm: x0 = mem256[ input_0 + 0 ] +# asm 1: vmovupd 0(x0=reg256#7 +# asm 2: vmovupd 0(x0=%ymm6 +vmovupd 0(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 256 ] +# asm 1: vmovupd 256(x1=reg256#8 +# asm 2: vmovupd 256(x1=%ymm7 +vmovupd 256(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 512 ] +# asm 1: vmovupd 512(x2=reg256#9 +# asm 2: vmovupd 512(x2=%ymm8 +vmovupd 512(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 768 ] +# asm 1: vmovupd 768(x3=reg256#10 +# asm 2: vmovupd 768(x3=%ymm9 +vmovupd 768(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1024 ] +# asm 1: vmovupd 1024(x4=reg256#11 +# asm 2: vmovupd 1024(x4=%ymm10 +vmovupd 1024(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1280 ] +# asm 1: vmovupd 1280(x5=reg256#12 +# asm 2: vmovupd 1280(x5=%ymm11 +vmovupd 1280(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1536 ] +# asm 1: vmovupd 1536(x6=reg256#13 +# asm 2: vmovupd 1536(x6=%ymm12 +vmovupd 1536(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1792 ] +# asm 1: vmovupd 1792(x7=reg256#14 +# asm 2: vmovupd 1792(x7=%ymm13 +vmovupd 1792(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 
+# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x 
v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand 
v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 0 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 32(x0=%ymm6 +vmovupd 32(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 288 ] +# asm 1: vmovupd 288(x1=reg256#8 +# asm 2: vmovupd 288(x1=%ymm7 +vmovupd 288(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 544 ] +# asm 1: vmovupd 544(x2=reg256#9 +# asm 2: vmovupd 544(x2=%ymm8 +vmovupd 544(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 800 ] +# asm 1: vmovupd 800(x3=reg256#10 +# asm 2: vmovupd 800(x3=%ymm9 +vmovupd 800(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1056 ] +# asm 1: vmovupd 1056(x4=reg256#11 +# asm 2: vmovupd 1056(x4=%ymm10 +vmovupd 1056(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1312 ] +# asm 1: vmovupd 1312(x5=reg256#12 +# asm 2: vmovupd 1312(x5=%ymm11 +vmovupd 1312(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1568 ] +# asm 1: vmovupd 1568(x6=reg256#13 +# asm 2: vmovupd 1568(x6=%ymm12 +vmovupd 1568(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1824 ] +# asm 1: vmovupd 1824(x7=reg256#14 +# asm 2: vmovupd 1824(x7=%ymm13 +vmovupd 1824(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: 
vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: 
vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 32 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 64(x0=%ymm6 +vmovupd 64(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 320 ] +# asm 1: vmovupd 320(x1=reg256#8 +# asm 2: vmovupd 320(x1=%ymm7 +vmovupd 320(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 576 ] +# asm 1: vmovupd 576(x2=reg256#9 +# asm 2: vmovupd 576(x2=%ymm8 +vmovupd 576(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 832 ] +# asm 1: vmovupd 832(x3=reg256#10 +# asm 2: vmovupd 832(x3=%ymm9 +vmovupd 832(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1088 ] +# asm 1: vmovupd 1088(x4=reg256#11 +# asm 2: vmovupd 1088(x4=%ymm10 +vmovupd 1088(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1344 ] +# asm 1: vmovupd 1344(x5=reg256#12 +# asm 2: vmovupd 1344(x5=%ymm11 +vmovupd 1344(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1600 ] +# asm 1: vmovupd 1600(x6=reg256#13 +# asm 2: vmovupd 1600(x6=%ymm12 +vmovupd 1600(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1856 ] +# asm 1: vmovupd 1856(x7=reg256#14 +# asm 2: vmovupd 1856(x7=%ymm13 +vmovupd 1856(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: 
vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: 
vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw 
$8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 64 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 96(x0=%ymm6 +vmovupd 96(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 352 ] +# asm 1: vmovupd 352(x1=reg256#8 +# asm 2: vmovupd 352(x1=%ymm7 +vmovupd 352(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 608 ] +# asm 1: vmovupd 608(x2=reg256#9 +# asm 2: vmovupd 608(x2=%ymm8 +vmovupd 608(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 864 ] +# asm 1: vmovupd 864(x3=reg256#10 +# asm 2: vmovupd 864(x3=%ymm9 +vmovupd 864(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1120 ] +# asm 1: vmovupd 1120(x4=reg256#11 +# asm 2: vmovupd 1120(x4=%ymm10 +vmovupd 1120(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1376 ] +# asm 1: vmovupd 1376(x5=reg256#12 +# asm 2: vmovupd 1376(x5=%ymm11 +vmovupd 1376(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1632 ] +# asm 1: vmovupd 1632(x6=reg256#13 +# asm 2: vmovupd 1632(x6=%ymm12 +vmovupd 1632(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1888 ] +# asm 1: vmovupd 1888(x7=reg256#14 +# asm 2: vmovupd 1888(x7=%ymm13 +vmovupd 1888(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand 
v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld 
$16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 96 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 128(x0=%ymm6 +vmovupd 128(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 384 ] +# asm 1: vmovupd 384(x1=reg256#8 +# asm 2: vmovupd 384(x1=%ymm7 +vmovupd 384(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 640 ] +# asm 1: 
vmovupd 640(x2=reg256#9 +# asm 2: vmovupd 640(x2=%ymm8 +vmovupd 640(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 896 ] +# asm 1: vmovupd 896(x3=reg256#10 +# asm 2: vmovupd 896(x3=%ymm9 +vmovupd 896(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1152 ] +# asm 1: vmovupd 1152(x4=reg256#11 +# asm 2: vmovupd 1152(x4=%ymm10 +vmovupd 1152(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1408 ] +# asm 1: vmovupd 1408(x5=reg256#12 +# asm 2: vmovupd 1408(x5=%ymm11 +vmovupd 1408(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1664 ] +# asm 1: vmovupd 1664(x6=reg256#13 +# asm 2: vmovupd 1664(x6=%ymm12 +vmovupd 1664(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1920 ] +# asm 1: vmovupd 1920(x7=reg256#14 +# asm 2: vmovupd 1920(x7=%ymm13 +vmovupd 1920(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand 
v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 
+# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 128 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 160(x0=%ymm6 +vmovupd 160(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 416 ] +# asm 1: vmovupd 416(x1=reg256#8 +# asm 2: vmovupd 416(x1=%ymm7 +vmovupd 416(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 672 ] +# asm 1: vmovupd 672(x2=reg256#9 +# asm 2: vmovupd 672(x2=%ymm8 +vmovupd 672(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 928 ] +# asm 1: vmovupd 928(x3=reg256#10 +# asm 2: vmovupd 928(x3=%ymm9 +vmovupd 928(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1184 ] +# asm 1: vmovupd 1184(x4=reg256#11 +# asm 2: vmovupd 1184(x4=%ymm10 +vmovupd 1184(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1440 ] +# asm 1: vmovupd 1440(x5=reg256#12 +# asm 2: vmovupd 1440(x5=%ymm11 +vmovupd 1440(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1696 ] +# asm 1: vmovupd 1696(x6=reg256#13 +# asm 2: vmovupd 1696(x6=%ymm12 +vmovupd 1696(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1952 ] +# asm 1: vmovupd 1952(x7=reg256#14 +# asm 2: vmovupd 1952(x7=%ymm13 +vmovupd 1952(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: 
vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: 
vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: 
vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 160 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 192(x0=%ymm6 +vmovupd 192(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 448 ] +# asm 1: vmovupd 448(x1=reg256#8 +# asm 2: vmovupd 448(x1=%ymm7 +vmovupd 448(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 704 ] +# asm 1: vmovupd 704(x2=reg256#9 +# asm 2: vmovupd 704(x2=%ymm8 +vmovupd 704(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 960 ] +# asm 1: vmovupd 960(x3=reg256#10 +# asm 2: vmovupd 960(x3=%ymm9 +vmovupd 960(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1216 ] +# asm 1: vmovupd 1216(x4=reg256#11 +# asm 2: vmovupd 1216(x4=%ymm10 +vmovupd 1216(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1472 ] +# asm 1: vmovupd 1472(x5=reg256#12 +# asm 2: vmovupd 1472(x5=%ymm11 +vmovupd 1472(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1728 ] +# asm 1: vmovupd 1728(x6=reg256#13 +# asm 2: vmovupd 1728(x6=%ymm12 +vmovupd 1728(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1984 ] +# asm 1: vmovupd 1984(x7=reg256#14 +# asm 2: vmovupd 1984(x7=%ymm13 +vmovupd 1984(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor 
[ qhasm-generated AVX2 transpose assembly added for PQCLEAN_MCELIECE460896F_AVX; the
  remainder of the generated .S file is summarized here rather than reproduced. It
  repeats a single pattern: eight 256-bit rows are loaded from input_0 (%rdi) with
  vmovupd, the constants PQCLEAN_MCELIECE460896F_AVX_MASK{2,1,0}_{0,1} are loaded with
  vmovapd as complementary mask pairs, and each pair of rows is combined block by
  block: vpand against the mask pair, a shift by the block width (vpsllq/vpsrlq $32,
  vpslld/vpsrld $16 and vpsllw/vpsrlw $8 in one routine; vpsllq/vpsrlq $4, $2 and $1
  in the other), then vpor to merge the two halves. The transposed rows are stored
  back with vmovupd at the same offsets. Each "# qhasm:" comment states the intended
  operation; the "# asm 1:" and "# asm 2:" comments give its register-allocated form. ]
mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 
<<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 1280 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 1536(x0=%ymm6 +vmovupd 1536(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 1568 ] +# asm 1: vmovupd 1568(x1=reg256#8 +# asm 2: vmovupd 1568(x1=%ymm7 +vmovupd 1568(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 1600 ] +# asm 1: vmovupd 1600(x2=reg256#9 +# asm 2: vmovupd 1600(x2=%ymm8 +vmovupd 1600(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 1632 ] +# asm 1: vmovupd 1632(x3=reg256#10 +# asm 2: vmovupd 1632(x3=%ymm9 +vmovupd 1632(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1664 ] +# asm 1: vmovupd 1664(x4=reg256#11 +# asm 2: vmovupd 1664(x4=%ymm10 +vmovupd 1664(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1696 ] +# asm 1: vmovupd 1696(x5=reg256#12 +# asm 2: vmovupd 1696(x5=%ymm11 +vmovupd 1696(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1728 ] +# asm 1: vmovupd 1728(x6=reg256#13 +# asm 2: vmovupd 1728(x6=%ymm12 +vmovupd 1728(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1760 ] +# asm 1: vmovupd 1760(x7=reg256#14 +# asm 2: vmovupd 1760(x7=%ymm13 +vmovupd 1760(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: 
vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# 
asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 1536 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 1792(x0=%ymm6 +vmovupd 1792(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 1824 ] +# asm 1: vmovupd 1824(x1=reg256#8 +# asm 2: vmovupd 1824(x1=%ymm7 +vmovupd 1824(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 1856 ] +# asm 1: vmovupd 1856(x2=reg256#9 +# asm 2: vmovupd 1856(x2=%ymm8 +vmovupd 1856(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 1888 ] +# asm 1: vmovupd 1888(x3=reg256#10 +# asm 2: vmovupd 1888(x3=%ymm9 +vmovupd 1888(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1920 ] +# asm 1: vmovupd 1920(x4=reg256#11 +# asm 2: vmovupd 1920(x4=%ymm10 +vmovupd 1920(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1952 ] +# asm 1: vmovupd 1952(x5=reg256#12 +# asm 2: vmovupd 1952(x5=%ymm11 +vmovupd 1952(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1984 ] +# asm 1: vmovupd 1984(x6=reg256#13 +# asm 2: vmovupd 1984(x6=%ymm12 +vmovupd 1984(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 2016 ] +# asm 1: vmovupd 2016(x7=reg256#14 +# asm 2: vmovupd 2016(x7=%ymm13 +vmovupd 2016(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: 
vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#1 +# asm 2: vpand v10=%ymm0 +vpand %ymm13,%ymm0,%ymm0 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#2 +# asm 2: vpand v11=%ymm1 +vpand %ymm13,%ymm1,%ymm1 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#1 +# asm 2: vpor x3=%ymm0 +vpor %ymm12,%ymm0,%ymm0 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#2 +# asm 2: vpor x7=%ymm1 +vpor %ymm9,%ymm1,%ymm1 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm14,%ymm2,%ymm9 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#13 +# asm 2: vpand v10=%ymm12 +vpand %ymm11,%ymm2,%ymm12 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm14,%ymm3,%ymm13 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm12,%ymm9 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm13,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm10,%ymm2,%ymm12 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#14 +# asm 2: vpand v10=%ymm13 +vpand %ymm0,%ymm2,%ymm13 + +# qhasm: 4x v10 <<= 2 +# asm 1: 
vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#1 +# asm 2: vpand v11=%ymm0 +vpand %ymm0,%ymm3,%ymm0 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#13 +# asm 2: vpor x1=%ymm12 +vpor %ymm12,%ymm13,%ymm12 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#1 +# asm 2: vpor x3=%ymm0 +vpor %ymm10,%ymm0,%ymm0 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm6,%ymm2,%ymm10 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#14 +# asm 2: vpand v10=%ymm13 +vpand %ymm8,%ymm2,%ymm13 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#11 +# asm 2: vpor x4=%ymm10 +vpor %ymm10,%ymm13,%ymm10 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#3 +# asm 2: vpand v10=%ymm2 +vpand %ymm1,%ymm2,%ymm2 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#2 +# asm 2: vpand v11=%ymm1 +vpand %ymm1,%ymm3,%ymm1 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#3 +# asm 2: vpor x5=%ymm2 +vpor %ymm8,%ymm2,%ymm2 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#2 +# asm 2: vpor x7=%ymm1 +vpor %ymm7,%ymm1,%ymm1 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#4 +# asm 2: vpand v00=%ymm3 +vpand %ymm9,%ymm4,%ymm3 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#8 +# asm 2: vpand v10=%ymm7 +vpand %ymm12,%ymm4,%ymm7 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm9,%ymm5,%ymm8 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm12,%ymm5,%ymm9 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#4 +# asm 2: vpor x0=%ymm3 +vpor %ymm3,%ymm7,%ymm3 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#8 +# asm 2: vpor x1=%ymm7 +vpor %ymm8,%ymm9,%ymm7 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm11,%ymm4,%ymm8 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#10 +# asm 2: vpand v10=%ymm9 +vpand %ymm0,%ymm4,%ymm9 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#1 +# asm 2: vpand v11=%ymm0 +vpand %ymm0,%ymm5,%ymm0 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#9 +# asm 2: vpor x2=%ymm8 +vpor %ymm8,%ymm9,%ymm8 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#1 +# asm 2: vpor x3=%ymm0 +vpor %ymm11,%ymm0,%ymm0 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm10,%ymm4,%ymm9 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#12 +# asm 2: vpand v10=%ymm11 +vpand %ymm2,%ymm4,%ymm11 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#3 +# asm 2: vpand v11=%ymm2 +vpand %ymm2,%ymm5,%ymm2 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#10 +# asm 2: 
vpor x4=%ymm9 +vpor %ymm9,%ymm11,%ymm9 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#3 +# asm 2: vpor x5=%ymm2 +vpor %ymm10,%ymm2,%ymm2 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm6,%ymm4,%ymm10 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#5 +# asm 2: vpand v10=%ymm4 +vpand %ymm1,%ymm4,%ymm4 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#2 +# asm 2: vpand v11=%ymm1 +vpand %ymm1,%ymm5,%ymm1 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#5 +# asm 2: vpor x6=%ymm4 +vpor %ymm10,%ymm4,%ymm4 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#2 +# asm 2: vpor x7=%ymm1 +vpor %ymm6,%ymm1,%ymm1 + +# qhasm: mem256[ input_0 + 1792 ] = x0 +# asm 1: vmovupd +#include + +void PQCLEAN_MCELIECE460896F_AVX_uint32_sort(uint32_t *x, size_t n); + +#endif diff --git a/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/update_asm.S b/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/update_asm.S new file mode 100644 index 0000000000..b819b3a91a --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/update_asm.S @@ -0,0 +1,576 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: int64 s0 + +# qhasm: int64 s1 + +# qhasm: int64 s2 + +# qhasm: enter update_asm +.p2align 5 +.global _PQCLEAN_MCELIECE460896F_AVX_update_asm +.global PQCLEAN_MCELIECE460896F_AVX_update_asm +_PQCLEAN_MCELIECE460896F_AVX_update_asm: +PQCLEAN_MCELIECE460896F_AVX_update_asm: +mov %rsp,%r11 +and $31,%r11 +add $0,%r11 +sub %r11,%rsp + +# qhasm: s2 = input_1 +# asm 1: mov s2=int64#2 +# asm 2: mov s2=%rsi +mov %rsi,%rsi + +# qhasm: s0 = mem64[ input_0 + 0 ] +# asm 1: movq 0(s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: 
movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1, + +void PQCLEAN_MCELIECE460896F_AVX_store_i(unsigned char *out, uint64_t in, int i) { + int j; + + for (j = 0; j < i; j++) { + out[j] = (in >> (j * 8)) & 0xFF; + } +} + +void PQCLEAN_MCELIECE460896F_AVX_store2(unsigned char *dest, uint16_t a) { + dest[0] = a & 0xFF; + dest[1] = a >> 8; +} + +uint16_t PQCLEAN_MCELIECE460896F_AVX_load2(const unsigned char *src) { + uint16_t a; + + a = src[1]; + a <<= 8; + a |= src[0]; + + return a & GFMASK; +} + +uint32_t PQCLEAN_MCELIECE460896F_AVX_load4(const unsigned char *src) { + uint32_t a; + + a = src[3]; + a <<= 8; + a |= src[2]; + a <<= 8; + a |= src[1]; + a <<= 8; + a |= src[0]; + + return a; +} + +void PQCLEAN_MCELIECE460896F_AVX_irr_load(vec128 *out, const unsigned char *in) { + int i, j; + uint64_t v[2]; + uint16_t irr[ SYS_T + 1 ]; + + for (i = 0; i < SYS_T; i++) { + irr[i] = PQCLEAN_MCELIECE460896F_AVX_load2(in + i * 2); + irr[i] &= GFMASK; + } + + irr[ SYS_T ] = 1; + + for (i = 0; i < GFBITS; i++) { + v[0] = v[1] = 0; + + for (j = 63; j >= 0; j--) { + v[0] <<= 1; + v[0] |= (irr[j] >> i) & 1; + } + for (j = SYS_T; j >= 64; j--) { + v[1] <<= 1; + v[1] |= (irr[j] >> i) & 1; + } + + out[i] = PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(v[0], v[1]); + } +} + +void PQCLEAN_MCELIECE460896F_AVX_store8(unsigned char *out, uint64_t in) { + out[0] = (in >> 0x00) & 0xFF; + out[1] = (in >> 0x08) & 0xFF; + out[2] = (in >> 0x10) & 0xFF; + out[3] = (in >> 0x18) & 0xFF; + out[4] = (in >> 0x20) & 0xFF; + out[5] = (in >> 0x28) & 0xFF; + out[6] = (in >> 0x30) & 0xFF; + out[7] = (in >> 0x38) & 0xFF; +} + +uint64_t PQCLEAN_MCELIECE460896F_AVX_load8(const unsigned char *in) { + int i; + uint64_t 
ret = in[7]; + + for (i = 6; i >= 0; i--) { + ret <<= 8; + ret |= in[i]; + } + + return ret; +} + +vec128 PQCLEAN_MCELIECE460896F_AVX_load16(const unsigned char *in) { + return PQCLEAN_MCELIECE460896F_AVX_vec128_set2x( PQCLEAN_MCELIECE460896F_AVX_load8(in), PQCLEAN_MCELIECE460896F_AVX_load8(in + 8) ); +} + +void PQCLEAN_MCELIECE460896F_AVX_store16(unsigned char *out, vec128 in) { + PQCLEAN_MCELIECE460896F_AVX_store8(out + 0, PQCLEAN_MCELIECE460896F_AVX_vec128_extract(in, 0)); + PQCLEAN_MCELIECE460896F_AVX_store8(out + 8, PQCLEAN_MCELIECE460896F_AVX_vec128_extract(in, 1)); +} diff --git a/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/util.h b/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/util.h new file mode 100644 index 0000000000..54a5efd6be --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/util.h @@ -0,0 +1,23 @@ +#ifndef PQCLEAN_MCELIECE460896F_AVX_UTIL_H +#define PQCLEAN_MCELIECE460896F_AVX_UTIL_H +/* + This file is for loading/storing data in a little-endian fashion +*/ + + +#include "vec128.h" + +#include + +void PQCLEAN_MCELIECE460896F_AVX_store_i(unsigned char *out, uint64_t in, int i); +void PQCLEAN_MCELIECE460896F_AVX_store2(unsigned char *dest, uint16_t a); +uint16_t PQCLEAN_MCELIECE460896F_AVX_load2(const unsigned char *src); +uint32_t PQCLEAN_MCELIECE460896F_AVX_load4(const unsigned char *src); +void PQCLEAN_MCELIECE460896F_AVX_irr_load(vec128 *out, const unsigned char *in); +void PQCLEAN_MCELIECE460896F_AVX_store8(unsigned char *out, uint64_t in); +uint64_t PQCLEAN_MCELIECE460896F_AVX_load8(const unsigned char *in); +vec128 PQCLEAN_MCELIECE460896F_AVX_load16(const unsigned char *in); + +void PQCLEAN_MCELIECE460896F_AVX_store16(unsigned char *out, vec128 in); + +#endif diff --git a/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/vec128.c b/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/vec128.c new file mode 100644 index 0000000000..4cb57f5e40 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/vec128.c @@ -0,0 +1,79 @@ +#include "vec128.h" + +vec128 PQCLEAN_MCELIECE460896F_AVX_vec128_set1_16b(uint16_t a) { + return _mm_set1_epi16(a); +} + +vec128 PQCLEAN_MCELIECE460896F_AVX_vec128_setzero(void) { + return _mm_setzero_si128(); +} + +vec128 PQCLEAN_MCELIECE460896F_AVX_vec128_and(vec128 a, vec128 b) { + return _mm_and_si128(a, b); +} + +vec128 PQCLEAN_MCELIECE460896F_AVX_vec128_xor(vec128 a, vec128 b) { + return _mm_xor_si128(a, b); +} + +vec128 PQCLEAN_MCELIECE460896F_AVX_vec128_or(vec128 a, vec128 b) { + return _mm_or_si128(a, b); +} + +vec128 PQCLEAN_MCELIECE460896F_AVX_vec128_sll_2x(vec128 a, int s) { + return _mm_slli_epi64(a, s); +} + +vec128 PQCLEAN_MCELIECE460896F_AVX_vec128_srl_2x(vec128 a, int s) { + return _mm_srli_epi64(a, s); +} + +vec128 PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(uint64_t a0, uint64_t a1) { + return _mm_set_epi64x(a1, a0); +} + +vec128 PQCLEAN_MCELIECE460896F_AVX_vec128_unpack_low(vec128 a, vec128 b) { + return _mm_unpacklo_epi64(a, b); +} + +vec128 PQCLEAN_MCELIECE460896F_AVX_vec128_unpack_high(vec128 a, vec128 b) { + return _mm_unpackhi_epi64(a, b); +} + +vec128 PQCLEAN_MCELIECE460896F_AVX_vec128_setbits(uint64_t a) { + return _mm_set1_epi64x(-a); +} + +void PQCLEAN_MCELIECE460896F_AVX_vec128_copy(vec128 *dest, const vec128 *src) { + int i; + + for (i = 0; i < GFBITS; i++) { + dest[i] = src[i]; + } +} + +void PQCLEAN_MCELIECE460896F_AVX_vec128_add(vec128 *c, const vec128 *a, const vec128 *b) { + int i; + + for (i = 0; i < GFBITS; i++) { + c[i] = 
PQCLEAN_MCELIECE460896F_AVX_vec128_xor(a[i], b[i]); + } +} + +vec128 PQCLEAN_MCELIECE460896F_AVX_vec128_or_reduce(const vec128 *a) { + int i; + vec128 ret; + + ret = a[0]; + for (i = 1; i < GFBITS; i++) { + ret = PQCLEAN_MCELIECE460896F_AVX_vec128_or(ret, a[i]); + } + + return ret; +} + +/* bitsliced field multiplications */ +void PQCLEAN_MCELIECE460896F_AVX_vec128_mul(vec128 *h, vec128 *f, const vec128 *g) { + PQCLEAN_MCELIECE460896F_AVX_vec128_mul_asm(h, f, g, 16); +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/vec128.h b/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/vec128.h new file mode 100644 index 0000000000..6dd2c6cd85 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/vec128.h @@ -0,0 +1,41 @@ +#ifndef PQCLEAN_MCELIECE460896F_AVX_VEC128_H +#define PQCLEAN_MCELIECE460896F_AVX_VEC128_H +/* + This file is for functions related to 128-bit vectors + including functions for bitsliced field operations +*/ + + +#include "params.h" + +#include +#include + +typedef __m128i vec128; + +// this needs to be a macro, because +// _mm_extract_epi64 requires a literal int argument. +#define PQCLEAN_MCELIECE460896F_AVX_vec128_extract(a, i) ((uint64_t) _mm_extract_epi64((vec128) (a), (i))) + +int PQCLEAN_MCELIECE460896F_AVX_vec128_testz(vec128 a); +vec128 PQCLEAN_MCELIECE460896F_AVX_vec128_set1_16b(uint16_t a); +vec128 PQCLEAN_MCELIECE460896F_AVX_vec128_setzero(void); +vec128 PQCLEAN_MCELIECE460896F_AVX_vec128_and(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE460896F_AVX_vec128_xor(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE460896F_AVX_vec128_or(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE460896F_AVX_vec128_sll_2x(vec128 a, int s); +vec128 PQCLEAN_MCELIECE460896F_AVX_vec128_srl_2x(vec128 a, int s); +vec128 PQCLEAN_MCELIECE460896F_AVX_vec128_set2x(uint64_t a0, uint64_t a1); +vec128 PQCLEAN_MCELIECE460896F_AVX_vec128_unpack_low(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE460896F_AVX_vec128_unpack_high(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE460896F_AVX_vec128_setbits(uint64_t a); +void PQCLEAN_MCELIECE460896F_AVX_vec128_copy(vec128 *dest, const vec128 *src); +void PQCLEAN_MCELIECE460896F_AVX_vec128_add(vec128 *c, const vec128 *a, const vec128 *b); +vec128 PQCLEAN_MCELIECE460896F_AVX_vec128_or_reduce(const vec128 *a); + +extern void PQCLEAN_MCELIECE460896F_AVX_vec128_mul_asm(vec128 *, vec128 *, const vec128 *, int); + +/* bitsliced field multiplications */ +void PQCLEAN_MCELIECE460896F_AVX_vec128_mul(vec128 *h, vec128 *f, const vec128 *g); + +#endif diff --git a/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/vec128_mul_asm.S b/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/vec128_mul_asm.S new file mode 100644 index 0000000000..ad187dd33f --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/vec128_mul_asm.S @@ -0,0 +1,1816 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: reg256 b0 + +# qhasm: reg256 b1 + +# qhasm: reg256 b2 + +# qhasm: reg256 b3 + +# qhasm: reg256 b4 + +# qhasm: reg256 b5 + +# qhasm: reg256 b6 + +# qhasm: reg256 b7 + +# qhasm: reg256 b8 + +# qhasm: reg256 b9 + +# qhasm: reg256 b10 + +# qhasm: reg256 b11 + +# qhasm: reg256 
b12 + +# qhasm: reg256 a0 + +# qhasm: reg256 a1 + +# qhasm: reg256 a2 + +# qhasm: reg256 a3 + +# qhasm: reg256 a4 + +# qhasm: reg256 a5 + +# qhasm: reg256 a6 + +# qhasm: reg256 r0 + +# qhasm: reg256 r1 + +# qhasm: reg256 r2 + +# qhasm: reg256 r3 + +# qhasm: reg256 r4 + +# qhasm: reg256 r5 + +# qhasm: reg256 r6 + +# qhasm: reg256 r7 + +# qhasm: reg256 r8 + +# qhasm: reg256 r9 + +# qhasm: reg256 r10 + +# qhasm: reg256 r11 + +# qhasm: reg256 r12 + +# qhasm: reg256 r13 + +# qhasm: reg256 r14 + +# qhasm: reg256 r15 + +# qhasm: reg256 r16 + +# qhasm: reg256 r17 + +# qhasm: reg256 r18 + +# qhasm: reg256 r19 + +# qhasm: reg256 r20 + +# qhasm: reg256 r21 + +# qhasm: reg256 r22 + +# qhasm: reg256 r23 + +# qhasm: reg256 r24 + +# qhasm: reg256 r + +# qhasm: reg128 h0 + +# qhasm: reg128 h1 + +# qhasm: reg128 h2 + +# qhasm: reg128 h3 + +# qhasm: reg128 h4 + +# qhasm: reg128 h5 + +# qhasm: reg128 h6 + +# qhasm: reg128 h7 + +# qhasm: reg128 h8 + +# qhasm: reg128 h9 + +# qhasm: reg128 h10 + +# qhasm: reg128 h11 + +# qhasm: reg128 h12 + +# qhasm: reg128 h13 + +# qhasm: reg128 h14 + +# qhasm: reg128 h15 + +# qhasm: reg128 h16 + +# qhasm: reg128 h17 + +# qhasm: reg128 h18 + +# qhasm: reg128 h19 + +# qhasm: reg128 h20 + +# qhasm: reg128 h21 + +# qhasm: reg128 h22 + +# qhasm: reg128 h23 + +# qhasm: reg128 h24 + +# qhasm: stack4864 buf + +# qhasm: int64 ptr + +# qhasm: int64 tmp + +# qhasm: enter vec128_mul_asm +.p2align 5 +.global _PQCLEAN_MCELIECE460896F_AVX_vec128_mul_asm +.global PQCLEAN_MCELIECE460896F_AVX_vec128_mul_asm +_PQCLEAN_MCELIECE460896F_AVX_vec128_mul_asm: +PQCLEAN_MCELIECE460896F_AVX_vec128_mul_asm: +mov %rsp,%r11 +and $31,%r11 +add $608,%r11 +sub %r11,%rsp + +# qhasm: ptr = &buf +# asm 1: leaq ptr=int64#5 +# asm 2: leaq ptr=%r8 +leaq 0(%rsp),%r8 + +# qhasm: tmp = input_3 +# asm 1: mov tmp=int64#6 +# asm 2: mov tmp=%r9 +mov %rcx,%r9 + +# qhasm: tmp *= 12 +# asm 1: imulq $12,tmp=int64#6 +# asm 2: imulq $12,tmp=%r9 +imulq $12,%r9,%r9 + +# qhasm: input_2 += tmp +# asm 1: add b12=reg256#1 +# asm 2: vbroadcasti128 0(b12=%ymm0 +vbroadcasti128 0(%rdx), %ymm0 + +# qhasm: input_2 -= input_3 +# asm 1: sub a6=reg256#2 +# asm 2: vpxor a6=%ymm1 +vpxor %ymm1,%ymm1,%ymm1 + +# qhasm: a6[0] = mem128[ input_1 + 96 ] +# asm 1: vinsertf128 $0x0,96(r18=reg256#3 +# asm 2: vpand r18=%ymm2 +vpand %ymm0,%ymm1,%ymm2 + +# qhasm: mem256[ ptr + 576 ] = r18 +# asm 1: vmovupd r17=reg256#4 +# asm 2: vpand r17=%ymm3 +vpand %ymm0,%ymm2,%ymm3 + +# qhasm: a4[0] = mem128[ input_1 + 64 ] +# asm 1: vinsertf128 $0x0,64(r16=reg256#6 +# asm 2: vpand r16=%ymm5 +vpand %ymm0,%ymm4,%ymm5 + +# qhasm: a3[0] = mem128[ input_1 + 48 ] +# asm 1: vinsertf128 $0x0,48(r15=reg256#8 +# asm 2: vpand r15=%ymm7 +vpand %ymm0,%ymm6,%ymm7 + +# qhasm: a2[0] = mem128[ input_1 + 32 ] +# asm 1: vinsertf128 $0x0,32(r14=reg256#10 +# asm 2: vpand r14=%ymm9 +vpand %ymm0,%ymm8,%ymm9 + +# qhasm: a1[0] = mem128[ input_1 + 16 ] +# asm 1: vinsertf128 $0x0,16(r13=reg256#12 +# asm 2: vpand r13=%ymm11 +vpand %ymm0,%ymm10,%ymm11 + +# qhasm: a0[0] = mem128[ input_1 + 0 ] +# asm 1: vinsertf128 $0x0,0(r12=reg256#1 +# asm 2: vpand r12=%ymm0 +vpand %ymm0,%ymm12,%ymm0 + +# qhasm: b11 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b11=reg256#14 +# asm 2: vbroadcasti128 0(b11=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm2,%ymm3 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: 
vpand r=%ymm3 +vpand %ymm13,%ymm4,%ymm3 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm6,%ymm3 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm8,%ymm3 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm10,%ymm3 + +# qhasm: r12 ^= r +# asm 1: vpxor r11=reg256#4 +# asm 2: vpand r11=%ymm3 +vpand %ymm13,%ymm12,%ymm3 + +# qhasm: b10 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b10=reg256#14 +# asm 2: vbroadcasti128 0(b10=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm2,%ymm5 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm4,%ymm5 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm6,%ymm5 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm8,%ymm5 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm10,%ymm5 + +# qhasm: r11 ^= r +# asm 1: vpxor r10=reg256#6 +# asm 2: vpand r10=%ymm5 +vpand %ymm13,%ymm12,%ymm5 + +# qhasm: b9 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b9=reg256#14 +# asm 2: vbroadcasti128 0(b9=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm2,%ymm7 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm4,%ymm7 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm6,%ymm7 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm8,%ymm7 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm10,%ymm7 + +# qhasm: r10 ^= r +# asm 1: vpxor r9=reg256#8 +# asm 2: vpand r9=%ymm7 +vpand %ymm13,%ymm12,%ymm7 + +# qhasm: b8 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b8=reg256#14 +# asm 2: vbroadcasti128 0(b8=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm2,%ymm9 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm6,%ymm9 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm8,%ymm9 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm10,%ymm9 + +# qhasm: r9 ^= r +# asm 1: vpxor r8=reg256#10 +# asm 2: vpand r8=%ymm9 +vpand %ymm13,%ymm12,%ymm9 + +# qhasm: b7 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b7=reg256#14 +# asm 2: vbroadcasti128 0(b7=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm2,%ymm11 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm4,%ymm11 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand 
%ymm13,%ymm6,%ymm11 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm8,%ymm11 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm10,%ymm11 + +# qhasm: r8 ^= r +# asm 1: vpxor r7=reg256#12 +# asm 2: vpand r7=%ymm11 +vpand %ymm13,%ymm12,%ymm11 + +# qhasm: b6 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b6=reg256#14 +# asm 2: vbroadcasti128 0(b6=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm2,%ymm0 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm4,%ymm0 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm6,%ymm0 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm8,%ymm0 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm10,%ymm0 + +# qhasm: r7 ^= r +# asm 1: vpxor r6=reg256#1 +# asm 2: vpand r6=%ymm0 +vpand %ymm13,%ymm12,%ymm0 + +# qhasm: b5 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b5=reg256#14 +# asm 2: vbroadcasti128 0(b5=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm2,%ymm3 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm4,%ymm3 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm6,%ymm3 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm8,%ymm3 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm10,%ymm3 + +# qhasm: r6 ^= r +# asm 1: vpxor r5=reg256#4 +# asm 2: vpand r5=%ymm3 +vpand %ymm13,%ymm12,%ymm3 + +# qhasm: b4 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b4=reg256#14 +# asm 2: vbroadcasti128 0(b4=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm2,%ymm5 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm4,%ymm5 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm6,%ymm5 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm8,%ymm5 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm10,%ymm5 + +# qhasm: r5 ^= r +# asm 1: vpxor r4=reg256#6 +# asm 2: vpand r4=%ymm5 +vpand %ymm13,%ymm12,%ymm5 + +# qhasm: b3 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b3=reg256#14 +# asm 2: vbroadcasti128 0(b3=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm2,%ymm7 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm4,%ymm7 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm6,%ymm7 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm8,%ymm7 + +# qhasm: r5 ^= r +# asm 1: vpxor 
r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm10,%ymm7 + +# qhasm: r4 ^= r +# asm 1: vpxor r3=reg256#8 +# asm 2: vpand r3=%ymm7 +vpand %ymm13,%ymm12,%ymm7 + +# qhasm: b2 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b2=reg256#14 +# asm 2: vbroadcasti128 0(b2=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm2,%ymm9 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm6,%ymm9 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm8,%ymm9 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm10,%ymm9 + +# qhasm: r3 ^= r +# asm 1: vpxor r2=reg256#10 +# asm 2: vpand r2=%ymm9 +vpand %ymm13,%ymm12,%ymm9 + +# qhasm: b1 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b1=reg256#14 +# asm 2: vbroadcasti128 0(b1=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm2,%ymm11 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm4,%ymm11 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm6,%ymm11 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm8,%ymm11 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm10,%ymm11 + +# qhasm: r2 ^= r +# asm 1: vpxor r1=reg256#12 +# asm 2: vpand r1=%ymm11 +vpand %ymm13,%ymm12,%ymm11 + +# qhasm: b0 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b0=reg256#14 +# asm 2: vbroadcasti128 0(b0=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#2 +# asm 2: vpand r=%ymm1 +vpand %ymm13,%ymm1,%ymm1 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm2,%ymm0 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm4,%ymm0 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm6,%ymm0 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm8,%ymm0 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm10,%ymm0 + +# qhasm: r1 ^= r +# asm 1: vpxor r0=reg256#1 +# asm 2: vpand r0=%ymm0 +vpand %ymm13,%ymm12,%ymm0 + +# qhasm: mem256[ ptr + 160 ] = r5 +# asm 1: vmovupd h24=reg128#1 +# asm 2: movdqu 560(h24=%xmm0 +movdqu 560(%r8),%xmm0 + +# qhasm: h11 = h24 +# asm 1: movdqa h11=reg128#2 +# asm 2: movdqa h11=%xmm1 +movdqa %xmm0,%xmm1 + +# qhasm: h12 = h24 +# asm 1: movdqa h12=reg128#3 +# asm 2: movdqa h12=%xmm2 +movdqa %xmm0,%xmm2 + +# qhasm: h14 = h24 +# asm 1: movdqa h14=reg128#4 +# asm 2: movdqa h14=%xmm3 +movdqa %xmm0,%xmm3 + +# qhasm: h15 = h24 +# asm 1: movdqa h15=reg128#1 +# asm 2: movdqa h15=%xmm0 +movdqa %xmm0,%xmm0 + +# qhasm: h23 = mem128[ ptr + 528 ] +# asm 1: movdqu 528(h23=reg128#5 +# asm 2: movdqu 528(h23=%xmm4 +movdqu 528(%r8),%xmm4 + +# qhasm: h10 = h23 +# asm 1: movdqa h10=reg128#6 +# asm 2: movdqa h10=%xmm5 +movdqa %xmm4,%xmm5 + +# qhasm: h11 = h11 ^ h23 +# asm 1: vpxor h11=reg128#2 +# asm 2: vpxor h11=%xmm1 +vpxor 
%xmm4,%xmm1,%xmm1 + +# qhasm: h13 = h23 +# asm 1: movdqa h13=reg128#7 +# asm 2: movdqa h13=%xmm6 +movdqa %xmm4,%xmm6 + +# qhasm: h14 = h14 ^ h23 +# asm 1: vpxor h14=reg128#4 +# asm 2: vpxor h14=%xmm3 +vpxor %xmm4,%xmm3,%xmm3 + +# qhasm: h22 = mem128[ ptr + 496 ] +# asm 1: movdqu 496(h22=reg128#5 +# asm 2: movdqu 496(h22=%xmm4 +movdqu 496(%r8),%xmm4 + +# qhasm: h9 = h22 +# asm 1: movdqa h9=reg128#8 +# asm 2: movdqa h9=%xmm7 +movdqa %xmm4,%xmm7 + +# qhasm: h10 = h10 ^ h22 +# asm 1: vpxor h10=reg128#6 +# asm 2: vpxor h10=%xmm5 +vpxor %xmm4,%xmm5,%xmm5 + +# qhasm: h12 = h12 ^ h22 +# asm 1: vpxor h12=reg128#3 +# asm 2: vpxor h12=%xmm2 +vpxor %xmm4,%xmm2,%xmm2 + +# qhasm: h13 = h13 ^ h22 +# asm 1: vpxor h13=reg128#5 +# asm 2: vpxor h13=%xmm4 +vpxor %xmm4,%xmm6,%xmm4 + +# qhasm: h21 = mem128[ ptr + 464 ] +# asm 1: movdqu 464(h21=reg128#7 +# asm 2: movdqu 464(h21=%xmm6 +movdqu 464(%r8),%xmm6 + +# qhasm: h8 = h21 +# asm 1: movdqa h8=reg128#9 +# asm 2: movdqa h8=%xmm8 +movdqa %xmm6,%xmm8 + +# qhasm: h9 = h9 ^ h21 +# asm 1: vpxor h9=reg128#8 +# asm 2: vpxor h9=%xmm7 +vpxor %xmm6,%xmm7,%xmm7 + +# qhasm: h11 = h11 ^ h21 +# asm 1: vpxor h11=reg128#2 +# asm 2: vpxor h11=%xmm1 +vpxor %xmm6,%xmm1,%xmm1 + +# qhasm: h12 = h12 ^ h21 +# asm 1: vpxor h12=reg128#3 +# asm 2: vpxor h12=%xmm2 +vpxor %xmm6,%xmm2,%xmm2 + +# qhasm: h20 = mem128[ ptr + 432 ] +# asm 1: movdqu 432(h20=reg128#7 +# asm 2: movdqu 432(h20=%xmm6 +movdqu 432(%r8),%xmm6 + +# qhasm: h7 = h20 +# asm 1: movdqa h7=reg128#10 +# asm 2: movdqa h7=%xmm9 +movdqa %xmm6,%xmm9 + +# qhasm: h8 = h8 ^ h20 +# asm 1: vpxor h8=reg128#9 +# asm 2: vpxor h8=%xmm8 +vpxor %xmm6,%xmm8,%xmm8 + +# qhasm: h10 = h10 ^ h20 +# asm 1: vpxor h10=reg128#6 +# asm 2: vpxor h10=%xmm5 +vpxor %xmm6,%xmm5,%xmm5 + +# qhasm: h11 = h11 ^ h20 +# asm 1: vpxor h11=reg128#2 +# asm 2: vpxor h11=%xmm1 +vpxor %xmm6,%xmm1,%xmm1 + +# qhasm: h19 = mem128[ ptr + 400 ] +# asm 1: movdqu 400(h19=reg128#7 +# asm 2: movdqu 400(h19=%xmm6 +movdqu 400(%r8),%xmm6 + +# qhasm: h6 = h19 +# asm 1: movdqa h6=reg128#11 +# asm 2: movdqa h6=%xmm10 +movdqa %xmm6,%xmm10 + +# qhasm: h7 = h7 ^ h19 +# asm 1: vpxor h7=reg128#10 +# asm 2: vpxor h7=%xmm9 +vpxor %xmm6,%xmm9,%xmm9 + +# qhasm: h9 = h9 ^ h19 +# asm 1: vpxor h9=reg128#8 +# asm 2: vpxor h9=%xmm7 +vpxor %xmm6,%xmm7,%xmm7 + +# qhasm: h10 = h10 ^ h19 +# asm 1: vpxor h10=reg128#6 +# asm 2: vpxor h10=%xmm5 +vpxor %xmm6,%xmm5,%xmm5 + +# qhasm: h18 = mem128[ ptr + 368 ] +# asm 1: movdqu 368(h18=reg128#7 +# asm 2: movdqu 368(h18=%xmm6 +movdqu 368(%r8),%xmm6 + +# qhasm: h18 = h18 ^ mem128[ ptr + 576 ] +# asm 1: vpxor 576(h18=reg128#7 +# asm 2: vpxor 576(h18=%xmm6 +vpxor 576(%r8),%xmm6,%xmm6 + +# qhasm: h5 = h18 +# asm 1: movdqa h5=reg128#12 +# asm 2: movdqa h5=%xmm11 +movdqa %xmm6,%xmm11 + +# qhasm: h6 = h6 ^ h18 +# asm 1: vpxor h6=reg128#11 +# asm 2: vpxor h6=%xmm10 +vpxor %xmm6,%xmm10,%xmm10 + +# qhasm: h8 = h8 ^ h18 +# asm 1: vpxor h8=reg128#9 +# asm 2: vpxor h8=%xmm8 +vpxor %xmm6,%xmm8,%xmm8 + +# qhasm: h9 = h9 ^ h18 +# asm 1: vpxor h9=reg128#7 +# asm 2: vpxor h9=%xmm6 +vpxor %xmm6,%xmm7,%xmm6 + +# qhasm: h17 = mem128[ ptr + 336 ] +# asm 1: movdqu 336(h17=reg128#8 +# asm 2: movdqu 336(h17=%xmm7 +movdqu 336(%r8),%xmm7 + +# qhasm: h17 = h17 ^ mem128[ ptr + 544 ] +# asm 1: vpxor 544(h17=reg128#8 +# asm 2: vpxor 544(h17=%xmm7 +vpxor 544(%r8),%xmm7,%xmm7 + +# qhasm: h4 = h17 +# asm 1: movdqa h4=reg128#13 +# asm 2: movdqa h4=%xmm12 +movdqa %xmm7,%xmm12 + +# qhasm: h5 = h5 ^ h17 +# asm 1: vpxor h5=reg128#12 +# asm 2: vpxor h5=%xmm11 +vpxor %xmm7,%xmm11,%xmm11 + +# qhasm: 
h7 = h7 ^ h17 +# asm 1: vpxor h7=reg128#10 +# asm 2: vpxor h7=%xmm9 +vpxor %xmm7,%xmm9,%xmm9 + +# qhasm: h8 = h8 ^ h17 +# asm 1: vpxor h8=reg128#8 +# asm 2: vpxor h8=%xmm7 +vpxor %xmm7,%xmm8,%xmm7 + +# qhasm: h16 = mem128[ ptr + 304 ] +# asm 1: movdqu 304(h16=reg128#9 +# asm 2: movdqu 304(h16=%xmm8 +movdqu 304(%r8),%xmm8 + +# qhasm: h16 = h16 ^ mem128[ ptr + 512 ] +# asm 1: vpxor 512(h16=reg128#9 +# asm 2: vpxor 512(h16=%xmm8 +vpxor 512(%r8),%xmm8,%xmm8 + +# qhasm: h3 = h16 +# asm 1: movdqa h3=reg128#14 +# asm 2: movdqa h3=%xmm13 +movdqa %xmm8,%xmm13 + +# qhasm: h4 = h4 ^ h16 +# asm 1: vpxor h4=reg128#13 +# asm 2: vpxor h4=%xmm12 +vpxor %xmm8,%xmm12,%xmm12 + +# qhasm: h6 = h6 ^ h16 +# asm 1: vpxor h6=reg128#11 +# asm 2: vpxor h6=%xmm10 +vpxor %xmm8,%xmm10,%xmm10 + +# qhasm: h7 = h7 ^ h16 +# asm 1: vpxor h7=reg128#9 +# asm 2: vpxor h7=%xmm8 +vpxor %xmm8,%xmm9,%xmm8 + +# qhasm: h15 = h15 ^ mem128[ ptr + 272 ] +# asm 1: vpxor 272(h15=reg128#1 +# asm 2: vpxor 272(h15=%xmm0 +vpxor 272(%r8),%xmm0,%xmm0 + +# qhasm: h15 = h15 ^ mem128[ ptr + 480 ] +# asm 1: vpxor 480(h15=reg128#1 +# asm 2: vpxor 480(h15=%xmm0 +vpxor 480(%r8),%xmm0,%xmm0 + +# qhasm: h2 = h15 +# asm 1: movdqa h2=reg128#10 +# asm 2: movdqa h2=%xmm9 +movdqa %xmm0,%xmm9 + +# qhasm: h3 = h3 ^ h15 +# asm 1: vpxor h3=reg128#14 +# asm 2: vpxor h3=%xmm13 +vpxor %xmm0,%xmm13,%xmm13 + +# qhasm: h5 = h5 ^ h15 +# asm 1: vpxor h5=reg128#12 +# asm 2: vpxor h5=%xmm11 +vpxor %xmm0,%xmm11,%xmm11 + +# qhasm: h6 = h6 ^ h15 +# asm 1: vpxor h6=reg128#1 +# asm 2: vpxor h6=%xmm0 +vpxor %xmm0,%xmm10,%xmm0 + +# qhasm: h14 = h14 ^ mem128[ ptr + 240 ] +# asm 1: vpxor 240(h14=reg128#4 +# asm 2: vpxor 240(h14=%xmm3 +vpxor 240(%r8),%xmm3,%xmm3 + +# qhasm: h14 = h14 ^ mem128[ ptr + 448 ] +# asm 1: vpxor 448(h14=reg128#4 +# asm 2: vpxor 448(h14=%xmm3 +vpxor 448(%r8),%xmm3,%xmm3 + +# qhasm: h1 = h14 +# asm 1: movdqa h1=reg128#11 +# asm 2: movdqa h1=%xmm10 +movdqa %xmm3,%xmm10 + +# qhasm: h2 = h2 ^ h14 +# asm 1: vpxor h2=reg128#10 +# asm 2: vpxor h2=%xmm9 +vpxor %xmm3,%xmm9,%xmm9 + +# qhasm: h4 = h4 ^ h14 +# asm 1: vpxor h4=reg128#13 +# asm 2: vpxor h4=%xmm12 +vpxor %xmm3,%xmm12,%xmm12 + +# qhasm: h5 = h5 ^ h14 +# asm 1: vpxor h5=reg128#4 +# asm 2: vpxor h5=%xmm3 +vpxor %xmm3,%xmm11,%xmm3 + +# qhasm: h13 = h13 ^ mem128[ ptr + 208 ] +# asm 1: vpxor 208(h13=reg128#5 +# asm 2: vpxor 208(h13=%xmm4 +vpxor 208(%r8),%xmm4,%xmm4 + +# qhasm: h13 = h13 ^ mem128[ ptr + 416 ] +# asm 1: vpxor 416(h13=reg128#5 +# asm 2: vpxor 416(h13=%xmm4 +vpxor 416(%r8),%xmm4,%xmm4 + +# qhasm: h0 = h13 +# asm 1: movdqa h0=reg128#12 +# asm 2: movdqa h0=%xmm11 +movdqa %xmm4,%xmm11 + +# qhasm: h1 = h1 ^ h13 +# asm 1: vpxor h1=reg128#11 +# asm 2: vpxor h1=%xmm10 +vpxor %xmm4,%xmm10,%xmm10 + +# qhasm: h3 = h3 ^ h13 +# asm 1: vpxor h3=reg128#14 +# asm 2: vpxor h3=%xmm13 +vpxor %xmm4,%xmm13,%xmm13 + +# qhasm: h4 = h4 ^ h13 +# asm 1: vpxor h4=reg128#5 +# asm 2: vpxor h4=%xmm4 +vpxor %xmm4,%xmm12,%xmm4 + +# qhasm: h12 = h12 ^ mem128[ ptr + 384 ] +# asm 1: vpxor 384(h12=reg128#3 +# asm 2: vpxor 384(h12=%xmm2 +vpxor 384(%r8),%xmm2,%xmm2 + +# qhasm: h12 = h12 ^ mem128[ ptr + 176 ] +# asm 1: vpxor 176(h12=reg128#3 +# asm 2: vpxor 176(h12=%xmm2 +vpxor 176(%r8),%xmm2,%xmm2 + +# qhasm: mem128[ input_0 + 192 ] = h12 +# asm 1: movdqu h11=reg128#2 +# asm 2: vpxor 352(h11=%xmm1 +vpxor 352(%r8),%xmm1,%xmm1 + +# qhasm: h11 = h11 ^ mem128[ ptr + 144 ] +# asm 1: vpxor 144(h11=reg128#2 +# asm 2: vpxor 144(h11=%xmm1 +vpxor 144(%r8),%xmm1,%xmm1 + +# qhasm: mem128[ input_0 + 176 ] = h11 +# asm 1: movdqu h10=reg128#2 +# 
asm 2: vpxor 320(h10=%xmm1 +vpxor 320(%r8),%xmm5,%xmm1 + +# qhasm: h10 = h10 ^ mem128[ ptr + 112 ] +# asm 1: vpxor 112(h10=reg128#2 +# asm 2: vpxor 112(h10=%xmm1 +vpxor 112(%r8),%xmm1,%xmm1 + +# qhasm: mem128[ input_0 + 160 ] = h10 +# asm 1: movdqu h9=reg128#2 +# asm 2: vpxor 288(h9=%xmm1 +vpxor 288(%r8),%xmm6,%xmm1 + +# qhasm: h9 = h9 ^ mem128[ ptr + 80 ] +# asm 1: vpxor 80(h9=reg128#2 +# asm 2: vpxor 80(h9=%xmm1 +vpxor 80(%r8),%xmm1,%xmm1 + +# qhasm: mem128[ input_0 + 144 ] = h9 +# asm 1: movdqu h8=reg128#2 +# asm 2: vpxor 256(h8=%xmm1 +vpxor 256(%r8),%xmm7,%xmm1 + +# qhasm: h8 = h8 ^ mem128[ ptr + 48 ] +# asm 1: vpxor 48(h8=reg128#2 +# asm 2: vpxor 48(h8=%xmm1 +vpxor 48(%r8),%xmm1,%xmm1 + +# qhasm: mem128[ input_0 + 128 ] = h8 +# asm 1: movdqu h7=reg128#2 +# asm 2: vpxor 224(h7=%xmm1 +vpxor 224(%r8),%xmm8,%xmm1 + +# qhasm: h7 = h7 ^ mem128[ ptr + 16 ] +# asm 1: vpxor 16(h7=reg128#2 +# asm 2: vpxor 16(h7=%xmm1 +vpxor 16(%r8),%xmm1,%xmm1 + +# qhasm: mem128[ input_0 + 112 ] = h7 +# asm 1: movdqu h6=reg128#1 +# asm 2: vpxor 192(h6=%xmm0 +vpxor 192(%r8),%xmm0,%xmm0 + +# qhasm: mem128[ input_0 + 96 ] = h6 +# asm 1: movdqu h5=reg128#1 +# asm 2: vpxor 160(h5=%xmm0 +vpxor 160(%r8),%xmm3,%xmm0 + +# qhasm: mem128[ input_0 + 80 ] = h5 +# asm 1: movdqu h4=reg128#1 +# asm 2: vpxor 128(h4=%xmm0 +vpxor 128(%r8),%xmm4,%xmm0 + +# qhasm: mem128[ input_0 + 64 ] = h4 +# asm 1: movdqu h3=reg128#1 +# asm 2: vpxor 96(h3=%xmm0 +vpxor 96(%r8),%xmm13,%xmm0 + +# qhasm: mem128[ input_0 + 48 ] = h3 +# asm 1: movdqu h2=reg128#1 +# asm 2: vpxor 64(h2=%xmm0 +vpxor 64(%r8),%xmm9,%xmm0 + +# qhasm: mem128[ input_0 + 32 ] = h2 +# asm 1: movdqu h1=reg128#1 +# asm 2: vpxor 32(h1=%xmm0 +vpxor 32(%r8),%xmm10,%xmm0 + +# qhasm: mem128[ input_0 + 16 ] = h1 +# asm 1: movdqu h0=reg128#1 +# asm 2: vpxor 0(h0=%xmm0 +vpxor 0(%r8),%xmm11,%xmm0 + +# qhasm: mem128[ input_0 + 0 ] = h0 +# asm 1: movdqu + +typedef __m256i vec256; + +vec256 PQCLEAN_MCELIECE460896F_AVX_vec256_set1_16b(uint16_t a); +vec256 PQCLEAN_MCELIECE460896F_AVX_vec256_setzero(void); +vec256 PQCLEAN_MCELIECE460896F_AVX_vec256_set4x(uint64_t a0, uint64_t a1, uint64_t a2, uint64_t a3); + +// Extract requires a literal argument so need to be macros +#define PQCLEAN_MCELIECE460896F_AVX_vec256_extract2x(a,i) ((vec128) _mm256_extractf128_si256((vec256) (a),(i))) +#define PQCLEAN_MCELIECE460896F_AVX_vec256_extract(a,i) ((uint64_t) _mm256_extract_epi64((vec256) (a),(i))) + +int PQCLEAN_MCELIECE460896F_AVX_vec256_testz(vec256 a); +vec256 PQCLEAN_MCELIECE460896F_AVX_vec256_and(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE460896F_AVX_vec256_xor(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE460896F_AVX_vec256_or(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE460896F_AVX_vec256_sll_4x(vec256 a, int s); +vec256 PQCLEAN_MCELIECE460896F_AVX_vec256_srl_4x(vec256 a, int s); +vec256 PQCLEAN_MCELIECE460896F_AVX_vec256_unpack_low(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE460896F_AVX_vec256_unpack_high(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE460896F_AVX_vec256_unpack_low_2x(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE460896F_AVX_vec256_unpack_high_2x(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE460896F_AVX_vec256_or_reduce(const vec256 *a); +void PQCLEAN_MCELIECE460896F_AVX_vec256_copy(vec256 *dest, const vec256 *src); + +/* bitsliced field multiplications */ +void PQCLEAN_MCELIECE460896F_AVX_vec256_mul(vec256 *h, vec256 *f, const vec256 *g); +void PQCLEAN_MCELIECE460896F_AVX_vec256_sq(vec256 * /*out*/, const vec256 * /*in*/); +void PQCLEAN_MCELIECE460896F_AVX_vec256_inv(vec256 * 
/*out*/, const vec256 * /*in*/); + +extern void PQCLEAN_MCELIECE460896F_AVX_vec256_maa_asm(vec256 *, vec256 *, const vec256 *); +extern void PQCLEAN_MCELIECE460896F_AVX_vec256_ama_asm(vec256 *, vec256 *, const vec256 *); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/vec256_ama_asm.S b/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/vec256_ama_asm.S new file mode 100644 index 0000000000..1b9a7070cf --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece460896f_avx/vec256_ama_asm.S @@ -0,0 +1,2322 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: reg256 a0 + +# qhasm: reg256 a1 + +# qhasm: reg256 a2 + +# qhasm: reg256 a3 + +# qhasm: reg256 a4 + +# qhasm: reg256 a5 + +# qhasm: reg256 a6 + +# qhasm: reg256 a7 + +# qhasm: reg256 a8 + +# qhasm: reg256 a9 + +# qhasm: reg256 a10 + +# qhasm: reg256 a11 + +# qhasm: reg256 a12 + +# qhasm: reg256 b0 + +# qhasm: reg256 b1 + +# qhasm: reg256 r0 + +# qhasm: reg256 r1 + +# qhasm: reg256 r2 + +# qhasm: reg256 r3 + +# qhasm: reg256 r4 + +# qhasm: reg256 r5 + +# qhasm: reg256 r6 + +# qhasm: reg256 r7 + +# qhasm: reg256 r8 + +# qhasm: reg256 r9 + +# qhasm: reg256 r10 + +# qhasm: reg256 r11 + +# qhasm: reg256 r12 + +# qhasm: reg256 r13 + +# qhasm: reg256 r14 + +# qhasm: reg256 r15 + +# qhasm: reg256 r16 + +# qhasm: reg256 r17 + +# qhasm: reg256 r18 + +# qhasm: reg256 r19 + +# qhasm: reg256 r20 + +# qhasm: reg256 r21 + +# qhasm: reg256 r22 + +# qhasm: reg256 r23 + +# qhasm: reg256 r24 + +# qhasm: reg256 r + +# qhasm: enter vec256_ama_asm +.p2align 5 +.global _PQCLEAN_MCELIECE460896F_AVX_vec256_ama_asm +.global PQCLEAN_MCELIECE460896F_AVX_vec256_ama_asm +_PQCLEAN_MCELIECE460896F_AVX_vec256_ama_asm: +PQCLEAN_MCELIECE460896F_AVX_vec256_ama_asm: +mov %rsp,%r11 +and $31,%r11 +add $0,%r11 +sub %r11,%rsp + +# qhasm: b0 = mem256[ input_2 + 0 ] +# asm 1: vmovupd 0(b0=reg256#1 +# asm 2: vmovupd 0(b0=%ymm0 +vmovupd 0(%rdx),%ymm0 + +# qhasm: a12 = mem256[ input_0 + 384 ] +# asm 1: vmovupd 384(a12=reg256#2 +# asm 2: vmovupd 384(a12=%ymm1 +vmovupd 384(%rdi),%ymm1 + +# qhasm: a12 = a12 ^ mem256[ input_1 + 384 ] +# asm 1: vpxor 384(a12=reg256#2 +# asm 2: vpxor 384(a12=%ymm1 +vpxor 384(%rsi),%ymm1,%ymm1 + +# qhasm: mem256[ input_0 + 384 ] = a12 +# asm 1: vmovupd r12=reg256#3 +# asm 2: vpand r12=%ymm2 +vpand %ymm1,%ymm0,%ymm2 + +# qhasm: r13 = a12 & mem256[input_2 + 32] +# asm 1: vpand 32(r13=reg256#4 +# asm 2: vpand 32(r13=%ymm3 +vpand 32(%rdx),%ymm1,%ymm3 + +# qhasm: r14 = a12 & mem256[input_2 + 64] +# asm 1: vpand 64(r14=reg256#5 +# asm 2: vpand 64(r14=%ymm4 +vpand 64(%rdx),%ymm1,%ymm4 + +# qhasm: r15 = a12 & mem256[input_2 + 96] +# asm 1: vpand 96(r15=reg256#6 +# asm 2: vpand 96(r15=%ymm5 +vpand 96(%rdx),%ymm1,%ymm5 + +# qhasm: r16 = a12 & mem256[input_2 + 128] +# asm 1: vpand 128(r16=reg256#7 +# asm 2: vpand 128(r16=%ymm6 +vpand 128(%rdx),%ymm1,%ymm6 + +# qhasm: r17 = a12 & mem256[input_2 + 160] +# asm 1: vpand 160(r17=reg256#8 +# asm 2: vpand 160(r17=%ymm7 +vpand 160(%rdx),%ymm1,%ymm7 + +# qhasm: r18 = a12 & mem256[input_2 + 192] +# asm 1: vpand 192(r18=reg256#9 +# asm 2: vpand 192(r18=%ymm8 +vpand 192(%rdx),%ymm1,%ymm8 + +# qhasm: r19 = a12 & mem256[input_2 
+ 224] +# asm 1: vpand 224(r19=reg256#10 +# asm 2: vpand 224(r19=%ymm9 +vpand 224(%rdx),%ymm1,%ymm9 + +# qhasm: r20 = a12 & mem256[input_2 + 256] +# asm 1: vpand 256(r20=reg256#11 +# asm 2: vpand 256(r20=%ymm10 +vpand 256(%rdx),%ymm1,%ymm10 + +# qhasm: r21 = a12 & mem256[input_2 + 288] +# asm 1: vpand 288(r21=reg256#12 +# asm 2: vpand 288(r21=%ymm11 +vpand 288(%rdx),%ymm1,%ymm11 + +# qhasm: r22 = a12 & mem256[input_2 + 320] +# asm 1: vpand 320(r22=reg256#13 +# asm 2: vpand 320(r22=%ymm12 +vpand 320(%rdx),%ymm1,%ymm12 + +# qhasm: r23 = a12 & mem256[input_2 + 352] +# asm 1: vpand 352(r23=reg256#14 +# asm 2: vpand 352(r23=%ymm13 +vpand 352(%rdx),%ymm1,%ymm13 + +# qhasm: r24 = a12 & mem256[input_2 + 384] +# asm 1: vpand 384(r24=reg256#2 +# asm 2: vpand 384(r24=%ymm1 +vpand 384(%rdx),%ymm1,%ymm1 + +# qhasm: r15 ^= r24 +# asm 1: vpxor r11=reg256#2 +# asm 2: vmovapd r11=%ymm1 +vmovapd %ymm1,%ymm1 + +# qhasm: a11 = mem256[ input_0 + 352 ] +# asm 1: vmovupd 352(a11=reg256#15 +# asm 2: vmovupd 352(a11=%ymm14 +vmovupd 352(%rdi),%ymm14 + +# qhasm: a11 = a11 ^ mem256[ input_1 + 352 ] +# asm 1: vpxor 352(a11=reg256#15 +# asm 2: vpxor 352(a11=%ymm14 +vpxor 352(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 352 ] = a11 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r21 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r22 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r23 ^= r +# asm 1: vpxor r10=reg256#14 +# asm 2: vmovapd r10=%ymm13 +vmovapd %ymm13,%ymm13 + +# qhasm: a10 = mem256[ input_0 + 320 ] +# asm 1: vmovupd 320(a10=reg256#15 +# asm 2: vmovupd 320(a10=%ymm14 +vmovupd 320(%rdi),%ymm14 + +# qhasm: a10 = a10 ^ mem256[ input_1 + 320 ] +# asm 1: vpxor 320(a10=reg256#15 +# asm 2: vpxor 320(a10=%ymm14 +vpxor 320(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 320 ] = a10 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# 
qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r21 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r22 ^= r +# asm 1: vpxor r9=reg256#13 +# asm 2: vmovapd r9=%ymm12 +vmovapd %ymm12,%ymm12 + +# qhasm: a9 = mem256[ input_0 + 288 ] +# asm 1: vmovupd 288(a9=reg256#15 +# asm 2: vmovupd 288(a9=%ymm14 +vmovupd 288(%rdi),%ymm14 + +# qhasm: a9 = a9 ^ mem256[ input_1 + 288 ] +# asm 1: vpxor 288(a9=reg256#15 +# asm 2: vpxor 288(a9=%ymm14 +vpxor 288(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 288 ] = a9 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r21 ^= r +# asm 1: vpxor r8=reg256#12 +# asm 2: vmovapd r8=%ymm11 +vmovapd %ymm11,%ymm11 + +# qhasm: a8 = mem256[ input_0 + 256 ] +# asm 1: vmovupd 256(a8=reg256#15 +# asm 2: vmovupd 256(a8=%ymm14 +vmovupd 256(%rdi),%ymm14 + +# qhasm: a8 = a8 ^ mem256[ input_1 + 256 ] +# asm 1: vpxor 256(a8=reg256#15 +# asm 2: vpxor 256(a8=%ymm14 +vpxor 256(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 256 ] = a8 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 
128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r20 ^= r +# asm 1: vpxor r7=reg256#11 +# asm 2: vmovapd r7=%ymm10 +vmovapd %ymm10,%ymm10 + +# qhasm: a7 = mem256[ input_0 + 224 ] +# asm 1: vmovupd 224(a7=reg256#15 +# asm 2: vmovupd 224(a7=%ymm14 +vmovupd 224(%rdi),%ymm14 + +# qhasm: a7 = a7 ^ mem256[ input_1 + 224 ] +# asm 1: vpxor 224(a7=reg256#15 +# asm 2: vpxor 224(a7=%ymm14 +vpxor 224(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 224 ] = a7 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r19 ^= r +# asm 1: vpxor r6=reg256#10 +# asm 2: vmovapd r6=%ymm9 +vmovapd %ymm9,%ymm9 + +# qhasm: a6 = mem256[ input_0 + 192 ] +# asm 1: vmovupd 192(a6=reg256#15 +# asm 2: vmovupd 192(a6=%ymm14 +vmovupd 192(%rdi),%ymm14 + +# qhasm: a6 = a6 ^ mem256[ input_1 + 192 ] +# asm 1: vpxor 192(a6=reg256#15 +# asm 2: vpxor 192(a6=%ymm14 +vpxor 192(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 192 ] = a6 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 
1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r18 ^= r +# asm 1: vpxor r5=reg256#9 +# asm 2: vmovapd r5=%ymm8 +vmovapd %ymm8,%ymm8 + +# qhasm: a5 = mem256[ input_0 + 160 ] +# asm 1: vmovupd 160(a5=reg256#15 +# asm 2: vmovupd 160(a5=%ymm14 +vmovupd 160(%rdi),%ymm14 + +# qhasm: a5 = a5 ^ mem256[ input_1 + 160 ] +# asm 1: vpxor 160(a5=reg256#15 +# asm 2: vpxor 160(a5=%ymm14 +vpxor 160(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 160 ] = a5 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r17 ^= r +# asm 1: vpxor r4=reg256#8 +# asm 2: vmovapd r4=%ymm7 +vmovapd %ymm7,%ymm7 + +# qhasm: a4 = mem256[ input_0 + 128 ] +# asm 1: vmovupd 128(a4=reg256#15 +# asm 2: vmovupd 128(a4=%ymm14 +vmovupd 128(%rdi),%ymm14 + +# qhasm: a4 = a4 ^ mem256[ input_1 + 128 ] +# asm 1: vpxor 128(a4=reg256#15 +# asm 2: vpxor 128(a4=%ymm14 +vpxor 128(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 128 ] = a4 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 
96(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r16 ^= r +# asm 1: vpxor r3=reg256#7 +# asm 2: vmovapd r3=%ymm6 +vmovapd %ymm6,%ymm6 + +# qhasm: a3 = mem256[ input_0 + 96 ] +# asm 1: vmovupd 96(a3=reg256#15 +# asm 2: vmovupd 96(a3=%ymm14 +vmovupd 96(%rdi),%ymm14 + +# qhasm: a3 = a3 ^ mem256[ input_1 + 96 ] +# asm 1: vpxor 96(a3=reg256#15 +# asm 2: vpxor 96(a3=%ymm14 +vpxor 96(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 96 ] = a3 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r15 ^= r +# asm 1: vpxor r2=reg256#6 +# asm 2: vmovapd r2=%ymm5 +vmovapd %ymm5,%ymm5 + +# qhasm: a2 = mem256[ input_0 + 64 ] +# asm 1: vmovupd 64(a2=reg256#15 +# asm 2: vmovupd 64(a2=%ymm14 +vmovupd 64(%rdi),%ymm14 + +# qhasm: a2 = a2 ^ mem256[ input_1 + 64 ] +# asm 1: vpxor 64(a2=reg256#15 +# asm 2: vpxor 64(a2=%ymm14 +vpxor 64(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 64 ] = a2 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 
+vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r14 ^= r +# asm 1: vpxor r1=reg256#5 +# asm 2: vmovapd r1=%ymm4 +vmovapd %ymm4,%ymm4 + +# qhasm: a1 = mem256[ input_0 + 32 ] +# asm 1: vmovupd 32(a1=reg256#15 +# asm 2: vmovupd 32(a1=%ymm14 +vmovupd 32(%rdi),%ymm14 + +# qhasm: a1 = a1 ^ mem256[ input_1 + 32 ] +# asm 1: vpxor 32(a1=reg256#15 +# asm 2: vpxor 32(a1=%ymm14 +vpxor 32(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 32 ] = a1 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r1 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r13 ^= r +# asm 1: vpxor r0=reg256#4 +# asm 2: vmovapd r0=%ymm3 +vmovapd %ymm3,%ymm3 + +# qhasm: a0 = mem256[ input_0 + 0 ] +# asm 1: vmovupd 0(a0=reg256#15 +# asm 2: vmovupd 0(a0=%ymm14 +vmovupd 0(%rdi),%ymm14 + +# qhasm: a0 = a0 ^ mem256[ input_1 + 0 ] +# asm 1: vpxor 0(a0=reg256#15 +# asm 2: vpxor 0(a0=%ymm14 +vpxor 0(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 0 ] = a0 +# asm 1: vmovupd r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm14,%ymm0,%ymm0 + +# qhasm: r0 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 32(r=%ymm0 +vpand 32(%rdx),%ymm14,%ymm0 + +# qhasm: r1 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 64(r=%ymm0 +vpand 64(%rdx),%ymm14,%ymm0 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 96(r=%ymm0 +vpand 
96(%rdx),%ymm14,%ymm0 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 128(r=%ymm0 +vpand 128(%rdx),%ymm14,%ymm0 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 160(r=%ymm0 +vpand 160(%rdx),%ymm14,%ymm0 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 192(r=%ymm0 +vpand 192(%rdx),%ymm14,%ymm0 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 224(r=%ymm0 +vpand 224(%rdx),%ymm14,%ymm0 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 256(r=%ymm0 +vpand 256(%rdx),%ymm14,%ymm0 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 288(r=%ymm0 +vpand 288(%rdx),%ymm14,%ymm0 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 320(r=%ymm0 +vpand 320(%rdx),%ymm14,%ymm0 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 352(r=%ymm0 +vpand 352(%rdx),%ymm14,%ymm0 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 384(r=%ymm0 +vpand 384(%rdx),%ymm14,%ymm0 + +# qhasm: r12 ^= r +# asm 1: vpxor r12=reg256#1 +# asm 2: vpxor 384(r12=%ymm0 +vpxor 384(%rsi),%ymm2,%ymm0 + +# qhasm: mem256[ input_1 + 384 ] = r12 +# asm 1: vmovupd r11=reg256#1 +# asm 2: vpxor 352(r11=%ymm0 +vpxor 352(%rsi),%ymm1,%ymm0 + +# qhasm: mem256[ input_1 + 352 ] = r11 +# asm 1: vmovupd r10=reg256#1 +# asm 2: vpxor 320(r10=%ymm0 +vpxor 320(%rsi),%ymm13,%ymm0 + +# qhasm: mem256[ input_1 + 320 ] = r10 +# asm 1: vmovupd r9=reg256#1 +# asm 2: vpxor 288(r9=%ymm0 +vpxor 288(%rsi),%ymm12,%ymm0 + +# qhasm: mem256[ input_1 + 288 ] = r9 +# asm 1: vmovupd r8=reg256#1 +# asm 2: vpxor 256(r8=%ymm0 +vpxor 256(%rsi),%ymm11,%ymm0 + +# qhasm: mem256[ input_1 + 256 ] = r8 +# asm 1: vmovupd r7=reg256#1 +# asm 2: vpxor 224(r7=%ymm0 +vpxor 224(%rsi),%ymm10,%ymm0 + +# qhasm: mem256[ input_1 + 224 ] = r7 +# asm 1: vmovupd r6=reg256#1 +# asm 2: vpxor 192(r6=%ymm0 +vpxor 192(%rsi),%ymm9,%ymm0 + +# qhasm: mem256[ input_1 + 192 ] = r6 +# asm 1: vmovupd r5=reg256#1 +# asm 2: vpxor 160(r5=%ymm0 +vpxor 160(%rsi),%ymm8,%ymm0 + +# qhasm: mem256[ input_1 + 160 ] = r5 +# asm 1: vmovupd r4=reg256#1 +# asm 2: vpxor 128(r4=%ymm0 +vpxor 128(%rsi),%ymm7,%ymm0 + +# qhasm: mem256[ input_1 + 128 ] = r4 +# asm 1: vmovupd r3=reg256#1 +# asm 2: vpxor 96(r3=%ymm0 +vpxor 96(%rsi),%ymm6,%ymm0 + +# qhasm: mem256[ input_1 + 96 ] = r3 +# asm 1: vmovupd r2=reg256#1 +# asm 2: vpxor 64(r2=%ymm0 +vpxor 64(%rsi),%ymm5,%ymm0 + +# qhasm: mem256[ input_1 + 64 ] = r2 +# asm 1: vmovupd r1=reg256#1 +# asm 2: vpxor 32(r1=%ymm0 +vpxor 32(%rsi),%ymm4,%ymm0 + +# qhasm: mem256[ input_1 + 32 ] = r1 +# asm 1: vmovupd r0=reg256#1 +# asm 2: vpxor 0(r0=%ymm0 +vpxor 0(%rsi),%ymm3,%ymm0 + +# qhasm: mem256[ input_1 + 0 ] = r0 +# asm 1: vmovupd b0=reg256#1 +# asm 2: vmovupd 0(b0=%ymm0 +vmovupd 0(%rdx),%ymm0 + +# qhasm: a12 = mem256[ input_1 + 384 ] +# asm 1: vmovupd 384(a12=reg256#2 +# asm 2: vmovupd 384(a12=%ymm1 +vmovupd 384(%rsi),%ymm1 + +# qhasm: r12 = a12 & b0 +# asm 1: vpand r12=reg256#3 +# asm 2: vpand r12=%ymm2 +vpand %ymm1,%ymm0,%ymm2 + +# qhasm: r13 = a12 & mem256[input_2 + 32] +# asm 1: vpand 32(r13=reg256#4 +# asm 2: vpand 32(r13=%ymm3 +vpand 32(%rdx),%ymm1,%ymm3 + +# qhasm: r14 = a12 & mem256[input_2 + 64] +# asm 1: vpand 64(r14=reg256#5 +# asm 2: vpand 64(r14=%ymm4 +vpand 64(%rdx),%ymm1,%ymm4 + +# qhasm: r15 = a12 & mem256[input_2 + 96] +# asm 1: vpand 96(r15=reg256#6 +# asm 2: vpand 96(r15=%ymm5 +vpand 96(%rdx),%ymm1,%ymm5 + +# qhasm: r16 = a12 & mem256[input_2 + 128] +# asm 1: vpand 128(r16=reg256#7 +# asm 2: vpand 128(r16=%ymm6 +vpand 128(%rdx),%ymm1,%ymm6 + +# qhasm: r17 = a12 & 
mem256[input_2 + 160] +# asm 1: vpand 160(r17=reg256#8 +# asm 2: vpand 160(r17=%ymm7 +vpand 160(%rdx),%ymm1,%ymm7 + +# qhasm: r18 = a12 & mem256[input_2 + 192] +# asm 1: vpand 192(r18=reg256#9 +# asm 2: vpand 192(r18=%ymm8 +vpand 192(%rdx),%ymm1,%ymm8 + +# qhasm: r19 = a12 & mem256[input_2 + 224] +# asm 1: vpand 224(r19=reg256#10 +# asm 2: vpand 224(r19=%ymm9 +vpand 224(%rdx),%ymm1,%ymm9 + +# qhasm: r20 = a12 & mem256[input_2 + 256] +# asm 1: vpand 256(r20=reg256#11 +# asm 2: vpand 256(r20=%ymm10 +vpand 256(%rdx),%ymm1,%ymm10 + +# qhasm: r21 = a12 & mem256[input_2 + 288] +# asm 1: vpand 288(r21=reg256#12 +# asm 2: vpand 288(r21=%ymm11 +vpand 288(%rdx),%ymm1,%ymm11 + +# qhasm: r22 = a12 & mem256[input_2 + 320] +# asm 1: vpand 320(r22=reg256#13 +# asm 2: vpand 320(r22=%ymm12 +vpand 320(%rdx),%ymm1,%ymm12 + +# qhasm: r23 = a12 & mem256[input_2 + 352] +# asm 1: vpand 352(r23=reg256#14 +# asm 2: vpand 352(r23=%ymm13 +vpand 352(%rdx),%ymm1,%ymm13 + +# qhasm: r24 = a12 & mem256[input_2 + 384] +# asm 1: vpand 384(r24=reg256#2 +# asm 2: vpand 384(r24=%ymm1 +vpand 384(%rdx),%ymm1,%ymm1 + +# qhasm: r15 ^= r24 +# asm 1: vpxor r11=reg256#2 +# asm 2: vmovapd r11=%ymm1 +vmovapd %ymm1,%ymm1 + +# qhasm: a11 = mem256[ input_1 + 352 ] +# asm 1: vmovupd 352(a11=reg256#15 +# asm 2: vmovupd 352(a11=%ymm14 +vmovupd 352(%rsi),%ymm14 + +# qhasm: r = a11 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r21 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r22 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r23 ^= r +# asm 1: vpxor r10=reg256#14 +# asm 2: vmovapd r10=%ymm13 +vmovapd %ymm13,%ymm13 + +# qhasm: a10 = mem256[ input_1 + 320 ] +# asm 1: vmovupd 320(a10=reg256#15 +# asm 2: vmovupd 320(a10=%ymm14 +vmovupd 320(%rsi),%ymm14 + +# qhasm: r = a10 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor 
r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r21 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r22 ^= r +# asm 1: vpxor r9=reg256#13 +# asm 2: vmovapd r9=%ymm12 +vmovapd %ymm12,%ymm12 + +# qhasm: a9 = mem256[ input_1 + 288 ] +# asm 1: vmovupd 288(a9=reg256#15 +# asm 2: vmovupd 288(a9=%ymm14 +vmovupd 288(%rsi),%ymm14 + +# qhasm: r = a9 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r21 ^= r +# asm 1: vpxor r8=reg256#12 +# asm 2: vmovapd r8=%ymm11 +vmovapd %ymm11,%ymm11 + +# qhasm: a8 = mem256[ input_1 + 256 ] +# asm 1: vmovupd 256(a8=reg256#15 +# asm 2: vmovupd 256(a8=%ymm14 +vmovupd 256(%rsi),%ymm14 + +# qhasm: r = a8 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 
224(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r20 ^= r +# asm 1: vpxor r7=reg256#11 +# asm 2: vmovapd r7=%ymm10 +vmovapd %ymm10,%ymm10 + +# qhasm: a7 = mem256[ input_1 + 224 ] +# asm 1: vmovupd 224(a7=reg256#15 +# asm 2: vmovupd 224(a7=%ymm14 +vmovupd 224(%rsi),%ymm14 + +# qhasm: r = a7 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r19 ^= r +# asm 1: vpxor r6=reg256#10 +# asm 2: vmovapd r6=%ymm9 +vmovapd %ymm9,%ymm9 + +# qhasm: a6 = mem256[ input_1 + 192 ] +# asm 1: vmovupd 192(a6=reg256#15 +# asm 2: vmovupd 192(a6=%ymm14 +vmovupd 192(%rsi),%ymm14 + +# qhasm: r = a6 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor 
r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r18 ^= r +# asm 1: vpxor r5=reg256#9 +# asm 2: vmovapd r5=%ymm8 +vmovapd %ymm8,%ymm8 + +# qhasm: a5 = mem256[ input_1 + 160 ] +# asm 1: vmovupd 160(a5=reg256#15 +# asm 2: vmovupd 160(a5=%ymm14 +vmovupd 160(%rsi),%ymm14 + +# qhasm: r = a5 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r17 ^= r +# asm 1: vpxor r4=reg256#8 +# asm 2: vmovapd r4=%ymm7 +vmovapd %ymm7,%ymm7 + +# qhasm: a4 = mem256[ input_1 + 128 ] +# asm 1: vmovupd 128(a4=reg256#15 +# asm 2: vmovupd 128(a4=%ymm14 +vmovupd 128(%rsi),%ymm14 + +# qhasm: r = a4 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + 
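The vpand/vpxor cascade generated by qhasm above follows a single pattern: every 256-bit limb of one operand is ANDed with every 32-byte limb of the other operand (offsets 0, 32, ..., 384 from input_2) and XORed into the matching product limb, i.e. a schoolbook carry-less multiplication over GF(2) on bitsliced data. The following is a minimal illustrative C sketch of that AND/XOR pattern using AVX2 intrinsics; it is not part of the patch and not the PQCLEAN_MCELIECE460896F_AVX_vec256_ama_asm routine itself. The limb count of 13 is an assumption read off the 0..384 byte offsets, the helper name is hypothetical, and the field reduction and the extra add/store steps performed by the real assembly are omitted.

    /*
     * Illustrative sketch only: the schoolbook AND/XOR multiply pattern
     * that the qhasm-generated vec256 routines expand above.
     * 13 limbs of 256 bits per operand give 25 product limbs; the
     * reduction done by the real code is not shown here.
     */
    #include <immintrin.h>

    #define GFBITS 13  /* assumed limb count, matching offsets 0..384 above */

    static void bitsliced_mul_sketch(__m256i r[2 * GFBITS - 1],
                                     const __m256i a[GFBITS],
                                     const __m256i b[GFBITS]) {
        int i, j;
        for (i = 0; i < 2 * GFBITS - 1; i++) {
            r[i] = _mm256_setzero_si256();
        }
        /* r[i+j] ^= a[i] & b[j]: carry-less multiplication over GF(2) */
        for (i = 0; i < GFBITS; i++) {
            for (j = 0; j < GFBITS; j++) {
                __m256i t = _mm256_and_si256(a[i], b[j]);
                r[i + j] = _mm256_xor_si256(r[i + j], t);
            }
        }
    }

The hand-scheduled assembly keeps one operand limb in a ymm register while streaming the other operand from memory, which is why each block above loads a single aN value and then issues thirteen vpand/vpxor pairs against input_2.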
+# qhasm: r16 ^= r +# asm 1: vpxor r3=reg256#7 +# asm 2: vmovapd r3=%ymm6 +vmovapd %ymm6,%ymm6 + +# qhasm: a3 = mem256[ input_1 + 96 ] +# asm 1: vmovupd 96(a3=reg256#15 +# asm 2: vmovupd 96(a3=%ymm14 +vmovupd 96(%rsi),%ymm14 + +# qhasm: r = a3 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r15 ^= r +# asm 1: vpxor r2=reg256#6 +# asm 2: vmovapd r2=%ymm5 +vmovapd %ymm5,%ymm5 + +# qhasm: a2 = mem256[ input_1 + 64 ] +# asm 1: vmovupd 64(a2=reg256#15 +# asm 2: vmovupd 64(a2=%ymm14 +vmovupd 64(%rsi),%ymm14 + +# qhasm: r = a2 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r14 ^= r +# asm 1: vpxor r1=reg256#5 +# asm 2: vmovapd r1=%ymm4 +vmovapd %ymm4,%ymm4 + +# qhasm: a1 = mem256[ input_1 + 32 ] +# asm 1: vmovupd 32(a1=reg256#15 +# asm 2: vmovupd 32(a1=%ymm14 +vmovupd 32(%rsi),%ymm14 + +# qhasm: r = a1 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 
+vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r1 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r13 ^= r +# asm 1: vpxor r0=reg256#4 +# asm 2: vmovapd r0=%ymm3 +vmovapd %ymm3,%ymm3 + +# qhasm: a0 = mem256[ input_1 + 0 ] +# asm 1: vmovupd 0(a0=reg256#15 +# asm 2: vmovupd 0(a0=%ymm14 +vmovupd 0(%rsi),%ymm14 + +# qhasm: r = a0 & b0 +# asm 1: vpand r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm14,%ymm0,%ymm0 + +# qhasm: r0 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 32(r=%ymm0 +vpand 32(%rdx),%ymm14,%ymm0 + +# qhasm: r1 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 64(r=%ymm0 +vpand 64(%rdx),%ymm14,%ymm0 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 96(r=%ymm0 +vpand 96(%rdx),%ymm14,%ymm0 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 128(r=%ymm0 +vpand 128(%rdx),%ymm14,%ymm0 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 160(r=%ymm0 +vpand 160(%rdx),%ymm14,%ymm0 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 192(r=%ymm0 +vpand 192(%rdx),%ymm14,%ymm0 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 224(r=%ymm0 +vpand 224(%rdx),%ymm14,%ymm0 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 256(r=%ymm0 +vpand 256(%rdx),%ymm14,%ymm0 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 288(r=%ymm0 +vpand 288(%rdx),%ymm14,%ymm0 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 320(r=%ymm0 +vpand 320(%rdx),%ymm14,%ymm0 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 352(r=%ymm0 +vpand 352(%rdx),%ymm14,%ymm0 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 384(r=%ymm0 +vpand 384(%rdx),%ymm14,%ymm0 + +# qhasm: r12 ^= r +# asm 1: vpxor r12=reg256#1 +# asm 2: vpxor 384(r12=%ymm0 +vpxor 384(%rdi),%ymm2,%ymm0 + +# qhasm: mem256[ input_0 + 384 ] = r12 +# asm 1: vmovupd r12=reg256#1 +# asm 2: vpxor 384(r12=%ymm0 +vpxor 384(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 384 ] = r12 +# asm 1: vmovupd r11=reg256#1 +# asm 2: vpxor 352(r11=%ymm0 +vpxor 352(%rdi),%ymm1,%ymm0 + +# qhasm: mem256[ input_0 + 352 ] = r11 +# asm 1: vmovupd r11=reg256#1 +# asm 2: vpxor 352(r11=%ymm0 +vpxor 352(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 352 ] = r11 +# asm 1: vmovupd r10=reg256#1 +# asm 2: vpxor 320(r10=%ymm0 +vpxor 320(%rdi),%ymm13,%ymm0 + +# qhasm: 
[qhasm-generated AVX2 assembly, continued: the XOR/store tail that folds the ymm accumulators r10..r0 back into the buffers at input_0 and input_1; a bitsliced 13-limb multiplication that ANDs a12..a0 (loaded from input_1) against the 13 words at input_2, XOR-accumulating the partial products into r0..r24; and a popcnt-based parity reduction over the 64-bit word pairs of input_0 at offsets 0..200.]
+#include
+
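/*
 * Editorial sketch, not part of the upstream PQClean sources: the header that
 * begins here (presumably aes256ctr.h of pqclean_mceliece6688128_avx) declares
 * an AES-256-CTR keystream expander.  A minimal, hypothetical caller follows;
 * the buffer length and the all-zero nonce/key are illustrative assumptions:
 *
 *     uint8_t stream[512];
 *     uint8_t nonce[AESCTR_NONCEBYTES] = {0};
 *     uint8_t key[AES256_KEYBYTES] = {0};   // a real seed would be used here
 *     PQCLEAN_MCELIECE6688128_AVX_aes256ctr(stream, sizeof stream, nonce, key);
 */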
+#include "aes.h" + + +void PQCLEAN_MCELIECE6688128_AVX_aes256ctr( + uint8_t *out, + size_t outlen, + const uint8_t nonce[AESCTR_NONCEBYTES], + const uint8_t key[AES256_KEYBYTES] +); + +#endif diff --git a/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/api.h b/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/api.h new file mode 100644 index 0000000000..51213f9d48 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/api.h @@ -0,0 +1,32 @@ +#ifndef PQCLEAN_MCELIECE6688128_AVX_API_H +#define PQCLEAN_MCELIECE6688128_AVX_API_H + +#include + +#define PQCLEAN_MCELIECE6688128_AVX_CRYPTO_ALGNAME "Classic McEliece 6688128" +#define PQCLEAN_MCELIECE6688128_AVX_CRYPTO_PUBLICKEYBYTES 1044992 +#define PQCLEAN_MCELIECE6688128_AVX_CRYPTO_SECRETKEYBYTES 13892 +#define PQCLEAN_MCELIECE6688128_AVX_CRYPTO_CIPHERTEXTBYTES 240 +#define PQCLEAN_MCELIECE6688128_AVX_CRYPTO_BYTES 32 + + +int PQCLEAN_MCELIECE6688128_AVX_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const uint8_t *pk +); + +int PQCLEAN_MCELIECE6688128_AVX_crypto_kem_dec( + uint8_t *key, + const uint8_t *c, + const uint8_t *sk +); + +int PQCLEAN_MCELIECE6688128_AVX_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/benes.c b/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/benes.c new file mode 100644 index 0000000000..937cfce112 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/benes.c @@ -0,0 +1,311 @@ +/* + This file is for Benes network related functions +*/ + +#include "benes.h" + +#include "params.h" +#include "transpose.h" +#include "util.h" + +static void layer_x(vec128 *data, vec128 *bits) { + int i; + vec128 v0, v1; + vec128 d; + + for (i = 0; i < 64; i += 2) { + v0 = PQCLEAN_MCELIECE6688128_AVX_vec128_unpack_low(data[i + 0], data[i + 1]); + v1 = PQCLEAN_MCELIECE6688128_AVX_vec128_unpack_high(data[i + 0], data[i + 1]); + + d = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(v0, v1); + d = PQCLEAN_MCELIECE6688128_AVX_vec128_and(d, *bits++); + v0 = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(v0, d); + v1 = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(v1, d); + + data[i + 0] = PQCLEAN_MCELIECE6688128_AVX_vec128_unpack_low(v0, v1); + data[i + 1] = PQCLEAN_MCELIECE6688128_AVX_vec128_unpack_high(v0, v1); + } +} + +static void layer_0(vec128 *bs, vec128 *cond) { + int x; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 2) { + diff = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ x ], bs[ x + 1 ]); + diff = PQCLEAN_MCELIECE6688128_AVX_vec128_and(diff, *cond++); + bs[ x ] = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ x ], diff); + bs[ x + 1 ] = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ x + 1 ], diff); + } +} + +static void layer_1(vec128 *bs, vec128 *cond) { + int x; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 4) { + diff = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ x + 0 ], bs[ x + 2 ]); + diff = PQCLEAN_MCELIECE6688128_AVX_vec128_and(diff, cond[0]); + bs[ x + 0 ] = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ x + 0 ], diff); + bs[ x + 2 ] = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ x + 2 ], diff); + + diff = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ x + 1 ], bs[ x + 3 ]); + diff = PQCLEAN_MCELIECE6688128_AVX_vec128_and(diff, cond[1]); + bs[ x + 1 ] = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ x + 1 ], diff); + bs[ x + 3 ] = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ x + 3 ], diff); + + cond += 2; + } +} + +static void layer_2(vec128 *bs, vec128 *cond) { + int x; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 8) { + diff = 
PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ x + 0 ], bs[ x + 4 ]); + diff = PQCLEAN_MCELIECE6688128_AVX_vec128_and(diff, cond[0]); + bs[ x + 0 ] = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ x + 0 ], diff); + bs[ x + 4 ] = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ x + 4 ], diff); + + diff = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ x + 1 ], bs[ x + 5 ]); + diff = PQCLEAN_MCELIECE6688128_AVX_vec128_and(diff, cond[1]); + bs[ x + 1 ] = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ x + 1 ], diff); + bs[ x + 5 ] = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ x + 5 ], diff); + + diff = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ x + 2 ], bs[ x + 6 ]); + diff = PQCLEAN_MCELIECE6688128_AVX_vec128_and(diff, cond[2]); + bs[ x + 2 ] = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ x + 2 ], diff); + bs[ x + 6 ] = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ x + 6 ], diff); + + diff = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ x + 3 ], bs[ x + 7 ]); + diff = PQCLEAN_MCELIECE6688128_AVX_vec128_and(diff, cond[3]); + bs[ x + 3 ] = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ x + 3 ], diff); + bs[ x + 7 ] = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ x + 7 ], diff); + + cond += 4; + } +} + +static void layer_3(vec128 *bs, vec128 *cond) { + int x, s; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 16) { + for (s = x; s < x + 8; s += 4) { + diff = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ s + 0 ], bs[ s + 8 ]); + diff = PQCLEAN_MCELIECE6688128_AVX_vec128_and(diff, cond[0]); + bs[ s + 0 ] = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ s + 0 ], diff); + bs[ s + 8 ] = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ s + 8 ], diff); + + diff = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ s + 1 ], bs[ s + 9 ]); + diff = PQCLEAN_MCELIECE6688128_AVX_vec128_and(diff, cond[1]); + bs[ s + 1 ] = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ s + 1 ], diff); + bs[ s + 9 ] = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ s + 9 ], diff); + + diff = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ s + 2 ], bs[ s + 10 ]); + diff = PQCLEAN_MCELIECE6688128_AVX_vec128_and(diff, cond[2]); + bs[ s + 2 ] = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ s + 2 ], diff); + bs[ s + 10 ] = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ s + 10 ], diff); + + diff = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ s + 3 ], bs[ s + 11 ]); + diff = PQCLEAN_MCELIECE6688128_AVX_vec128_and(diff, cond[3]); + bs[ s + 3 ] = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ s + 3 ], diff); + bs[ s + 11 ] = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ s + 11 ], diff); + + cond += 4; + } + } +} + +static void layer_4(vec128 *bs, vec128 *cond) { + int x, s; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 32) { + for (s = x; s < x + 16; s += 4) { + diff = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ s + 0 ], bs[ s + 16 ]); + diff = PQCLEAN_MCELIECE6688128_AVX_vec128_and(diff, cond[0]); + bs[ s + 0 ] = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ s + 0 ], diff); + bs[ s + 16 ] = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ s + 16 ], diff); + + diff = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ s + 1 ], bs[ s + 17 ]); + diff = PQCLEAN_MCELIECE6688128_AVX_vec128_and(diff, cond[1]); + bs[ s + 1 ] = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ s + 1 ], diff); + bs[ s + 17 ] = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ s + 17 ], diff); + + diff = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ s + 2 ], bs[ s + 18 ]); + diff = PQCLEAN_MCELIECE6688128_AVX_vec128_and(diff, cond[2]); + bs[ s + 2 ] = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ s + 2 ], diff); + bs[ s + 18 ] = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ s + 18 ], 
diff); + + diff = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ s + 3 ], bs[ s + 19 ]); + diff = PQCLEAN_MCELIECE6688128_AVX_vec128_and(diff, cond[3]); + bs[ s + 3 ] = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ s + 3 ], diff); + bs[ s + 19 ] = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ s + 19 ], diff); + + cond += 4; + } + } +} + +static void layer_5(vec128 *bs, vec128 *cond) { + int x, s; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 64) { + for (s = x; s < x + 32; s += 4) { + diff = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ s + 0 ], bs[ s + 32 ]); + diff = PQCLEAN_MCELIECE6688128_AVX_vec128_and(diff, cond[0]); + bs[ s + 0 ] = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ s + 0 ], diff); + bs[ s + 32 ] = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ s + 32 ], diff); + + diff = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ s + 1 ], bs[ s + 33 ]); + diff = PQCLEAN_MCELIECE6688128_AVX_vec128_and(diff, cond[1]); + bs[ s + 1 ] = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ s + 1 ], diff); + bs[ s + 33 ] = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ s + 33 ], diff); + + diff = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ s + 2 ], bs[ s + 34 ]); + diff = PQCLEAN_MCELIECE6688128_AVX_vec128_and(diff, cond[2]); + bs[ s + 2 ] = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ s + 2 ], diff); + bs[ s + 34 ] = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ s + 34 ], diff); + + diff = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ s + 3 ], bs[ s + 35 ]); + diff = PQCLEAN_MCELIECE6688128_AVX_vec128_and(diff, cond[3]); + bs[ s + 3 ] = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ s + 3 ], diff); + bs[ s + 35 ] = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(bs[ s + 35 ], diff); + + cond += 4; + } + } +} + +/* input: bits, control bits as array of bytes */ +/* output: bits_int, control bits as array of 128-bit vectors */ +void PQCLEAN_MCELIECE6688128_AVX_load_bits(vec128 bits_int[][32], const unsigned char *bits) { + int i, j; + const unsigned char *ptr = bits; + + vec128 buf[64]; + + for (i = 0; i <= 5; i += 2) { + for (j = 0; j < 64; j++) { + buf[j] = PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(PQCLEAN_MCELIECE6688128_AVX_load8(ptr), PQCLEAN_MCELIECE6688128_AVX_load8(ptr + 512)); + ptr += 8; + } + + PQCLEAN_MCELIECE6688128_AVX_transpose_64x128_sp( buf ); + + for (j = 0; j < 32; j++) { + bits_int[i + 0][j] = PQCLEAN_MCELIECE6688128_AVX_vec128_unpack_low(buf[j], buf[j + 32]); + bits_int[i + 1][j] = PQCLEAN_MCELIECE6688128_AVX_vec128_unpack_high(buf[j], buf[j + 32]); + } + + ptr += 512; + } + + for (i = 6; i <= 18; i++) { + for (j = 0; j < 32; j++) { + bits_int[i][j] = PQCLEAN_MCELIECE6688128_AVX_load16(ptr); + ptr += 16; + } + } + + for (i = 19; i < 25; i += 2) { + for (j = 0; j < 64; j++) { + buf[j] = PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(PQCLEAN_MCELIECE6688128_AVX_load8(ptr), PQCLEAN_MCELIECE6688128_AVX_load8(ptr + 512)); + ptr += 8; + } + + PQCLEAN_MCELIECE6688128_AVX_transpose_64x128_sp( buf ); + + for (j = 0; j < 32; j++) { + bits_int[i + 0][j] = PQCLEAN_MCELIECE6688128_AVX_vec128_unpack_low(buf[j], buf[j + 32]); + bits_int[i + 1][j] = PQCLEAN_MCELIECE6688128_AVX_vec128_unpack_high(buf[j], buf[j + 32]); + } + + ptr += 512; + } +} + +/* input: r, sequence of bits to be permuted */ +/* b, control bits as array of 128-bit vectors */ +/* rev, 0 for normal application; !0 for inverse */ +/* output: r, permuted bits */ +void PQCLEAN_MCELIECE6688128_AVX_benes(vec128 *r, vec128 b[][32], int rev) { + int inc; + + vec128 *b_ptr; + + if (rev == 0) { + inc = 32; + b_ptr = b[ 0]; + } else { + inc = -32; + b_ptr = b[24]; + } + + // + + 
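/*
 * Editorial note, not part of the upstream PQClean source: the calls below
 * interleave PQCLEAN_MCELIECE6688128_AVX_transpose_64x128_sp with the
 * conditional-swap layers.  Each layer_k swaps vec128 lanes that lie 2^k
 * apart, and layer_x swaps the low and high 64-bit halves of adjacent lanes
 * via unpack_low/unpack_high, all under the control bits prepared by
 * load_bits.  The core of every layer is the branch-free conditional swap
 * sketched here on plain 64-bit words (a, b, c are illustrative scalars;
 * c is an all-ones mask to swap, zero to keep):
 *
 *     uint64_t d = (a ^ b) & c;
 *     a ^= d;
 *     b ^= d;
 */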
PQCLEAN_MCELIECE6688128_AVX_transpose_64x128_sp( r ); + + layer_0(r, b_ptr); + b_ptr += inc; + layer_1(r, b_ptr); + b_ptr += inc; + layer_2(r, b_ptr); + b_ptr += inc; + layer_3(r, b_ptr); + b_ptr += inc; + layer_4(r, b_ptr); + b_ptr += inc; + layer_5(r, b_ptr); + b_ptr += inc; + + PQCLEAN_MCELIECE6688128_AVX_transpose_64x128_sp( r ); + + layer_x(r, b_ptr); + b_ptr += inc; + layer_0(r, b_ptr); + b_ptr += inc; + layer_1(r, b_ptr); + b_ptr += inc; + layer_2(r, b_ptr); + b_ptr += inc; + layer_3(r, b_ptr); + b_ptr += inc; + layer_4(r, b_ptr); + b_ptr += inc; + layer_5(r, b_ptr); + b_ptr += inc; + layer_4(r, b_ptr); + b_ptr += inc; + layer_3(r, b_ptr); + b_ptr += inc; + layer_2(r, b_ptr); + b_ptr += inc; + layer_1(r, b_ptr); + b_ptr += inc; + layer_0(r, b_ptr); + b_ptr += inc; + layer_x(r, b_ptr); + b_ptr += inc; + + PQCLEAN_MCELIECE6688128_AVX_transpose_64x128_sp( r ); + + layer_5(r, b_ptr); + b_ptr += inc; + layer_4(r, b_ptr); + b_ptr += inc; + layer_3(r, b_ptr); + b_ptr += inc; + layer_2(r, b_ptr); + b_ptr += inc; + layer_1(r, b_ptr); + b_ptr += inc; + layer_0(r, b_ptr); + //b_ptr += inc; + + PQCLEAN_MCELIECE6688128_AVX_transpose_64x128_sp( r ); +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/benes.h b/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/benes.h new file mode 100644 index 0000000000..9836478b89 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/benes.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE6688128_AVX_BENES_H +#define PQCLEAN_MCELIECE6688128_AVX_BENES_H +/* + This file is for Benes network related functions +*/ + + +#include "vec128.h" + +void PQCLEAN_MCELIECE6688128_AVX_load_bits(vec128 /*bits_int*/[][32], const unsigned char * /*bits*/); +void PQCLEAN_MCELIECE6688128_AVX_benes(vec128 * /*r*/, vec128 /*b*/[][32], int /*rev*/); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/bm.c b/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/bm.c new file mode 100644 index 0000000000..e06a27bed0 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/bm.c @@ -0,0 +1,219 @@ +/* + This file is for the inversion-free Berlekamp-Massey algorithm + see https://ieeexplore.ieee.org/document/87857 +*/ + +#include "bm.h" + +#include "gf.h" +#include "params.h" +#include "vec128.h" + +#include + +extern gf PQCLEAN_MCELIECE6688128_AVX_vec_reduce_asm(vec128 *); +extern void PQCLEAN_MCELIECE6688128_AVX_update_asm(void *, gf, int); + +static inline uint16_t mask_nonzero(gf a) { + uint32_t ret = a; + + ret -= 1; + ret >>= 31; + ret -= 1; + + return ret; +} + +static inline uint16_t mask_leq(uint16_t a, uint16_t b) { + uint32_t a_tmp = a; + uint32_t b_tmp = b; + uint32_t ret = b_tmp - a_tmp; + + ret >>= 31; + ret -= 1; + + return ret; +} + +static inline void vec128_cmov(vec128 out[][2], uint16_t mask) { + int i; + + vec128 v0, v1; + + vec128 m0 = PQCLEAN_MCELIECE6688128_AVX_vec128_set1_16b( mask); + vec128 m1 = PQCLEAN_MCELIECE6688128_AVX_vec128_set1_16b(~mask); + + for (i = 0; i < GFBITS; i++) { + v0 = PQCLEAN_MCELIECE6688128_AVX_vec128_and(out[i][1], m0); + v1 = PQCLEAN_MCELIECE6688128_AVX_vec128_and(out[i][0], m1); + out[i][0] = PQCLEAN_MCELIECE6688128_AVX_vec128_or(v0, v1); + } +} + +static inline void interleave(vec256 *in, int idx0, int idx1, vec256 *mask, int b) { + int s = 1 << b; + + vec256 x, y; + + x = PQCLEAN_MCELIECE6688128_AVX_vec256_or(PQCLEAN_MCELIECE6688128_AVX_vec256_and(in[idx0], mask[0]), + PQCLEAN_MCELIECE6688128_AVX_vec256_sll_4x(PQCLEAN_MCELIECE6688128_AVX_vec256_and(in[idx1], 
mask[0]), s)); + + y = PQCLEAN_MCELIECE6688128_AVX_vec256_or(PQCLEAN_MCELIECE6688128_AVX_vec256_srl_4x(PQCLEAN_MCELIECE6688128_AVX_vec256_and(in[idx0], mask[1]), s), + PQCLEAN_MCELIECE6688128_AVX_vec256_and(in[idx1], mask[1])); + + in[idx0] = x; + in[idx1] = y; +} + +/* input: in, field elements in bitsliced form */ +/* output: out, field elements in non-bitsliced form */ +static inline void get_coefs(gf *out, vec256 *in) { + int i, k; + + vec256 mask[4][2]; + vec256 buf[16]; + + for (i = 0; i < 13; i++) { + buf[i] = in[i]; + } + for (i = 13; i < 16; i++) { + buf[i] = PQCLEAN_MCELIECE6688128_AVX_vec256_setzero(); + } + + mask[0][0] = PQCLEAN_MCELIECE6688128_AVX_vec256_set1_16b(0x5555); + mask[0][1] = PQCLEAN_MCELIECE6688128_AVX_vec256_set1_16b(0xAAAA); + mask[1][0] = PQCLEAN_MCELIECE6688128_AVX_vec256_set1_16b(0x3333); + mask[1][1] = PQCLEAN_MCELIECE6688128_AVX_vec256_set1_16b(0xCCCC); + mask[2][0] = PQCLEAN_MCELIECE6688128_AVX_vec256_set1_16b(0x0F0F); + mask[2][1] = PQCLEAN_MCELIECE6688128_AVX_vec256_set1_16b(0xF0F0); + mask[3][0] = PQCLEAN_MCELIECE6688128_AVX_vec256_set1_16b(0x00FF); + mask[3][1] = PQCLEAN_MCELIECE6688128_AVX_vec256_set1_16b(0xFF00); + + interleave(buf, 0, 8, mask[3], 3); + interleave(buf, 1, 9, mask[3], 3); + interleave(buf, 2, 10, mask[3], 3); + interleave(buf, 3, 11, mask[3], 3); + interleave(buf, 4, 12, mask[3], 3); + interleave(buf, 5, 13, mask[3], 3); + interleave(buf, 6, 14, mask[3], 3); + interleave(buf, 7, 15, mask[3], 3); + + interleave(buf, 0, 4, mask[2], 2); + interleave(buf, 1, 5, mask[2], 2); + interleave(buf, 2, 6, mask[2], 2); + interleave(buf, 3, 7, mask[2], 2); + interleave(buf, 8, 12, mask[2], 2); + interleave(buf, 9, 13, mask[2], 2); + interleave(buf, 10, 14, mask[2], 2); + interleave(buf, 11, 15, mask[2], 2); + + interleave(buf, 0, 2, mask[1], 1); + interleave(buf, 1, 3, mask[1], 1); + interleave(buf, 4, 6, mask[1], 1); + interleave(buf, 5, 7, mask[1], 1); + interleave(buf, 8, 10, mask[1], 1); + interleave(buf, 9, 11, mask[1], 1); + interleave(buf, 12, 14, mask[1], 1); + interleave(buf, 13, 15, mask[1], 1); + + interleave(buf, 0, 1, mask[0], 0); + interleave(buf, 2, 3, mask[0], 0); + interleave(buf, 4, 5, mask[0], 0); + interleave(buf, 6, 7, mask[0], 0); + interleave(buf, 8, 9, mask[0], 0); + interleave(buf, 10, 11, mask[0], 0); + interleave(buf, 12, 13, mask[0], 0); + interleave(buf, 14, 15, mask[0], 0); + + for (i = 0; i < 16; i++) { + for (k = 0; k < 4; k++) { + out[ (4 * 0 + k) * 16 + i ] = (PQCLEAN_MCELIECE6688128_AVX_vec256_extract(buf[i], 0) >> (k * 16)) & GFMASK; + out[ (4 * 1 + k) * 16 + i ] = (PQCLEAN_MCELIECE6688128_AVX_vec256_extract(buf[i], 1) >> (k * 16)) & GFMASK; + out[ (4 * 2 + k) * 16 + i ] = (PQCLEAN_MCELIECE6688128_AVX_vec256_extract(buf[i], 2) >> (k * 16)) & GFMASK; + out[ (4 * 3 + k) * 16 + i ] = (PQCLEAN_MCELIECE6688128_AVX_vec256_extract(buf[i], 3) >> (k * 16)) & GFMASK; + } + } +} + +typedef union { + vec128 as_128[GFBITS][2]; + vec256 as_256[GFBITS]; +} aligned_double_vec128; + +/* input: in, sequence of field elements */ +/* output: out, minimal polynomial of in */ +void PQCLEAN_MCELIECE6688128_AVX_bm(vec128 *out, vec256 *in) { + int i; + uint16_t N, L; + uint16_t mask; + uint64_t one = 1, t; + + vec128 prod[ GFBITS ]; + vec128 interval[GFBITS]; + + aligned_double_vec128 db; + aligned_double_vec128 BC_tmp; + aligned_double_vec128 BC; + + gf d, b, c0 = 1; + gf coefs[256]; + + // initialization + + get_coefs(coefs, in); + + BC.as_128[0][0] = PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0, one << 63); + BC.as_128[0][1] = 
PQCLEAN_MCELIECE6688128_AVX_vec128_setzero(); + + for (i = 1; i < GFBITS; i++) { + BC.as_128[i][0] = BC.as_128[i][1] = PQCLEAN_MCELIECE6688128_AVX_vec128_setzero(); + } + + b = 1; + L = 0; + + // + + for (i = 0; i < GFBITS; i++) { + interval[i] = PQCLEAN_MCELIECE6688128_AVX_vec128_setzero(); + } + + for (N = 0; N < 256; N++) { + PQCLEAN_MCELIECE6688128_AVX_vec128_mul_asm(prod, interval, BC.as_128[0] + 1, 32); + PQCLEAN_MCELIECE6688128_AVX_update_asm(interval, coefs[N], 16); + + d = PQCLEAN_MCELIECE6688128_AVX_vec_reduce_asm(prod); + + t = PQCLEAN_MCELIECE6688128_AVX_gf_mul2(c0, coefs[N], b); + d ^= t & 0xFFFFFFFF; + + mask = mask_nonzero(d) & mask_leq(L * 2, N); + + for (i = 0; i < GFBITS; i++) { + db.as_128[i][0] = PQCLEAN_MCELIECE6688128_AVX_vec128_setbits((d >> i) & 1); + db.as_128[i][1] = PQCLEAN_MCELIECE6688128_AVX_vec128_setbits((b >> i) & 1); + } + + PQCLEAN_MCELIECE6688128_AVX_vec256_mul(BC_tmp.as_256, db.as_256, BC.as_256); + + vec128_cmov(BC.as_128, mask); + PQCLEAN_MCELIECE6688128_AVX_update_asm(BC.as_128, c0 & mask, 32); + + for (i = 0; i < GFBITS; i++) { + BC.as_128[i][1] = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(BC_tmp.as_128[i][0], BC_tmp.as_128[i][1]); + } + + c0 = t >> 32; + b = (d & mask) | (b & ~mask); + L = ((N + 1 - L) & mask) | (L & ~mask); + } + + c0 = PQCLEAN_MCELIECE6688128_AVX_gf_inv(c0); + + for (i = 0; i < GFBITS; i++) { + prod[i] = PQCLEAN_MCELIECE6688128_AVX_vec128_setbits((c0 >> i) & 1); + } + + PQCLEAN_MCELIECE6688128_AVX_vec128_mul_asm(out, prod, BC.as_128[0] + 1, 32); +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/bm.h b/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/bm.h new file mode 100644 index 0000000000..fc28b9a9e8 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/bm.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE6688128_AVX_BM_H +#define PQCLEAN_MCELIECE6688128_AVX_BM_H +/* + This file is for the inversion-free Berlekamp-Massey algorithm + see https://ieeexplore.ieee.org/document/87857 +*/ + +#include "vec128.h" +#include "vec256.h" + +void PQCLEAN_MCELIECE6688128_AVX_bm(vec128 * /*out*/, vec256 * /*in*/); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/consts.S b/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/consts.S new file mode 100644 index 0000000000..74691b8ceb --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/consts.S @@ -0,0 +1,33 @@ +.data + +# not supported on MacOS +#.section .rodata + +.globl PQCLEAN_MCELIECE6688128_AVX_MASK0_0 +.globl PQCLEAN_MCELIECE6688128_AVX_MASK0_1 +.globl PQCLEAN_MCELIECE6688128_AVX_MASK1_0 +.globl PQCLEAN_MCELIECE6688128_AVX_MASK1_1 +.globl PQCLEAN_MCELIECE6688128_AVX_MASK2_0 +.globl PQCLEAN_MCELIECE6688128_AVX_MASK2_1 +.globl PQCLEAN_MCELIECE6688128_AVX_MASK3_0 +.globl PQCLEAN_MCELIECE6688128_AVX_MASK3_1 +.globl PQCLEAN_MCELIECE6688128_AVX_MASK4_0 +.globl PQCLEAN_MCELIECE6688128_AVX_MASK4_1 +.globl PQCLEAN_MCELIECE6688128_AVX_MASK5_0 +.globl PQCLEAN_MCELIECE6688128_AVX_MASK5_1 + +.p2align 5 + +PQCLEAN_MCELIECE6688128_AVX_MASK0_0: .quad 0x5555555555555555, 0x5555555555555555, 0x5555555555555555, 0x5555555555555555 +PQCLEAN_MCELIECE6688128_AVX_MASK0_1: .quad 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA +PQCLEAN_MCELIECE6688128_AVX_MASK1_0: .quad 0x3333333333333333, 0x3333333333333333, 0x3333333333333333, 0x3333333333333333 +PQCLEAN_MCELIECE6688128_AVX_MASK1_1: .quad 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC 
+PQCLEAN_MCELIECE6688128_AVX_MASK2_0: .quad 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F +PQCLEAN_MCELIECE6688128_AVX_MASK2_1: .quad 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0 +PQCLEAN_MCELIECE6688128_AVX_MASK3_0: .quad 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF +PQCLEAN_MCELIECE6688128_AVX_MASK3_1: .quad 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00 +PQCLEAN_MCELIECE6688128_AVX_MASK4_0: .quad 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF +PQCLEAN_MCELIECE6688128_AVX_MASK4_1: .quad 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000 +PQCLEAN_MCELIECE6688128_AVX_MASK5_0: .quad 0x00000000FFFFFFFF, 0x00000000FFFFFFFF, 0x00000000FFFFFFFF, 0x00000000FFFFFFFF +PQCLEAN_MCELIECE6688128_AVX_MASK5_1: .quad 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000 + diff --git a/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/consts.inc b/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/consts.inc new file mode 100644 index 0000000000..72f800bd20 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/consts.inc @@ -0,0 +1,502 @@ +//64 +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X6969969669699696, 0X6969969669699696, 0X6969969669699696, 0X6969969669699696), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X9966669966999966, 0X9966669966999966, 0X9966669966999966, 0X9966669966999966), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X9966669966999966, 0X9966669966999966, 0X9966669966999966, 0X9966669966999966), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0XFF0000FF00FFFF00, 0XFF0000FF00FFFF00, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XCC3333CCCC3333CC, 0XCC3333CCCC3333CC, 0XCC3333CCCC3333CC, 0XCC3333CCCC3333CC), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X9966669966999966, 0X9966669966999966, 0X9966669966999966, 0X9966669966999966), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X6666666666666666, 0X6666666666666666, 0X6666666666666666, 0X6666666666666666), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA55AA55AA55AA55A, 0XA55AA55AA55AA55A, 0XA55AA55AA55AA55A, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XCCCC33333333CCCC, 0XCCCC33333333CCCC, 0XCCCC33333333CCCC, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0FF0F00FF00F0FF0, 0X0FF0F00FF00F0FF0, 0X0FF0F00FF00F0FF0, 0X0FF0F00FF00F0FF0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A) +}, +//128 +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X9999999966666666, 0X6666666699999999, 0X9999999966666666, 0X6666666699999999), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3, 0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + 
PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XCC33CC3333CC33CC, 0X33CC33CCCC33CC33, 0XCC33CC3333CC33CC, 0X33CC33CCCC33CC33), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X3C3C3C3C3C3C3C3C, 0X3C3C3C3C3C3C3C3C, 0X3C3C3C3C3C3C3C3C, 0X3C3C3C3C3C3C3C3C), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XAA5555AAAA5555AA, 0XAA5555AAAA5555AA, 0XAA5555AAAA5555AA, 0XAA5555AAAA5555AA), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XC33C3CC33CC3C33C, 0XC33C3CC33CC3C33C, 0XC33C3CC33CC3C33C, 0XC33C3CC33CC3C33C), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X00FFFF0000FFFF00, 0XFF0000FFFF0000FF, 0X00FFFF0000FFFF00, 0XFF0000FFFF0000FF) +}, +//256 +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0FF0F00FF00F0FF0, 0X0FF0F00FF00F0FF0, 0XF00F0FF00FF0F00F, 0XF00F0FF00FF0F00F), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X9669966969966996, 0X6996699696699669, 0X6996699696699669, 0X9669966969966996), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000FFFFFFFF0000, 0XFFFF00000000FFFF, 0X0000FFFFFFFF0000, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X33333333CCCCCCCC, 0X33333333CCCCCCCC, 0X33333333CCCCCCCC, 0X33333333CCCCCCCC), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0X5AA5A55AA55A5AA5, 0X5AA5A55AA55A5AA5, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X00FFFF0000FFFF00, 0XFF0000FFFF0000FF, 0XFF0000FFFF0000FF, 0X00FFFF0000FFFF00), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XC33CC33CC33CC33C, 0XC33CC33CC33CC33C, 0XC33CC33CC33CC33C, 0XC33CC33CC33CC33C), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555) +}, +//512 +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XC33C3CC33CC3C33C, 0X3CC3C33CC33C3CC3, 0XC33C3CC33CC3C33C, 0X3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X9966669966999966, 0X6699996699666699, 0X9966669966999966, 0X6699996699666699), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X9966996699669966, 0X6699669966996699, 0X6699669966996699, 0X9966996699669966), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X6969969669699696, 0X6969969669699696, 0X6969969669699696, 0X6969969669699696), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XAA55AA5555AA55AA, 0XAA55AA5555AA55AA, 0XAA55AA5555AA55AA, 0XAA55AA5555AA55AA), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X9966996699669966, 0X9966996699669966, 0X6699669966996699, 0X6699669966996699), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5AA5A55A5AA5A55A, 0XA55A5AA5A55A5AA5, 0X5AA5A55A5AA5A55A, 0XA55A5AA5A55A5AA5), + 
PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XC3C3C3C33C3C3C3C, 0XC3C3C3C33C3C3C3C, 0X3C3C3C3CC3C3C3C3, 0X3C3C3C3CC3C3C3C3), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X3333CCCC3333CCCC, 0X3333CCCC3333CCCC, 0XCCCC3333CCCC3333, 0XCCCC3333CCCC3333), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X9999999966666666, 0X6666666699999999, 0X6666666699999999, 0X9999999966666666), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XC33CC33CC33CC33C, 0X3CC33CC33CC33CC3, 0XC33CC33CC33CC33C, 0X3CC33CC33CC33CC3), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X6666999999996666, 0X9999666666669999, 0X9999666666669999, 0X6666999999996666) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XC33C3CC33CC3C33C, 0X3CC3C33CC33C3CC3, 0XC33C3CC33CC3C33C, 0X3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X6699996699666699, 0X9966669966999966, 0X6699996699666699, 0X9966669966999966), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X6699669966996699, 0X9966996699669966, 0X9966996699669966, 0X6699669966996699), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X6969969669699696, 0X6969969669699696, 0X6969969669699696, 0X6969969669699696), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X55AA55AAAA55AA55, 0X55AA55AAAA55AA55, 0X55AA55AAAA55AA55, 0X55AA55AAAA55AA55), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X9966996699669966, 0X9966996699669966, 0X6699669966996699, 0X6699669966996699), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5AA5A55A5AA5A55A, 0XA55A5AA5A55A5AA5, 0X5AA5A55A5AA5A55A, 0XA55A5AA5A55A5AA5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XC3C3C3C33C3C3C3C, 0XC3C3C3C33C3C3C3C, 0X3C3C3C3CC3C3C3C3, 0X3C3C3C3CC3C3C3C3), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X3333CCCC3333CCCC, 0X3333CCCC3333CCCC, 0XCCCC3333CCCC3333, 0XCCCC3333CCCC3333), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X9999999966666666, 0X6666666699999999, 0X6666666699999999, 0X9999999966666666), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XC33CC33CC33CC33C, 0X3CC33CC33CC33CC3, 0XC33CC33CC33CC33C, 0X3CC33CC33CC33CC3), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X6666999999996666, 0X9999666666669999, 0X9999666666669999, 0X6666999999996666) +}, +//1024 +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3, 0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA, 0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA, 0XAA5555AA55AAAA55, 0XAA5555AA55AAAA55), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF00F0FF0F00F0FF0, 0XF00F0FF0F00F0FF0, 0X0FF0F00F0FF0F00F, 0X0FF0F00F0FF0F00F), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X9669699696696996, 0X9669699696696996, 0X9669699696696996, 0X9669699696696996), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA55AA55AA55AA55A, 0X5AA55AA55AA55AA5, 0X5AA55AA55AA55AA5, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA, 0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XCCCC33333333CCCC, 
0X3333CCCCCCCC3333, 0X3333CCCCCCCC3333, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000, 0XFFFF00000000FFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X6996699669966996, 0X9669966996699669, 0X9669966996699669, 0X6996699669966996) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3, 0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA, 0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XAA5555AA55AAAA55, 0XAA5555AA55AAAA55, 0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0FF0F00F0FF0F00F, 0X0FF0F00F0FF0F00F, 0XF00F0FF0F00F0FF0, 0XF00F0FF0F00F0FF0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X6996966969969669, 0X6996966969969669, 0X6996966969969669, 0X6996966969969669), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA55AA55AA55AA55A, 0X5AA55AA55AA55AA5, 0X5AA55AA55AA55AA5, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555, 0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XCCCC33333333CCCC, 0X3333CCCCCCCC3333, 0X3333CCCCCCCC3333, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000, 0XFFFF00000000FFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X6996699669966996, 0X9669966996699669, 0X9669966996699669, 0X6996699669966996) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3, 0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555, 0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XAA5555AA55AAAA55, 0XAA5555AA55AAAA55, 0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF00F0FF0F00F0FF0, 0XF00F0FF0F00F0FF0, 0X0FF0F00F0FF0F00F, 0X0FF0F00F0FF0F00F), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X9669699696696996, 0X9669699696696996, 0X9669699696696996, 0X9669699696696996), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA55AA55AA55AA55A, 0X5AA55AA55AA55AA5, 0X5AA55AA55AA55AA5, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA, 0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XCCCC33333333CCCC, 0X3333CCCCCCCC3333, 0X3333CCCCCCCC3333, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000, 0XFFFF00000000FFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + 
PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X6996699669966996, 0X9669966996699669, 0X9669966996699669, 0X6996699669966996) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3, 0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555, 0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA, 0XAA5555AA55AAAA55, 0XAA5555AA55AAAA55), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0FF0F00F0FF0F00F, 0X0FF0F00F0FF0F00F, 0XF00F0FF0F00F0FF0, 0XF00F0FF0F00F0FF0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X6996966969969669, 0X6996966969969669, 0X6996966969969669, 0X6996966969969669), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA55AA55AA55AA55A, 0X5AA55AA55AA55AA5, 0X5AA55AA55AA55AA5, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555, 0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XCCCC33333333CCCC, 0X3333CCCCCCCC3333, 0X3333CCCCCCCC3333, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000, 0XFFFF00000000FFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X6996699669966996, 0X9669966996699669, 0X9669966996699669, 0X6996699669966996) +}, +//2048 +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA, 0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F, 0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + 
PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55, 0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0, 0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55, 0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F, 0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 
0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA, 0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0, 0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5555555555555555, 0X5555555555555555, 0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA, 0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F, 0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5555555555555555, 0X5555555555555555, 0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55, 0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + 
PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0, 0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5555555555555555, 0X5555555555555555, 0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55, 0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F, 0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5555555555555555, 0X5555555555555555, 0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA, 0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0, 0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X33CCCC33CC3333CC, 
0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +//4096 +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + 
PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 
0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + 
PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 
0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + 
PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 
0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + 
PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +} diff --git a/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/controlbits.c b/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/controlbits.c new file mode 100644 index 0000000000..4151542bc8 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/controlbits.c @@ -0,0 +1,274 @@ +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. 
a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + +#include "controlbits.h" + +#include "params.h" + +#include + +typedef uint8_t bit; + +#define N (1 << GFBITS) + +static bit is_smaller(uint32_t a, uint32_t b) { + uint32_t ret = 0; + + ret = a - b; + ret >>= 31; + + return (bit)ret; +} + +static bit is_smaller_63b(uint64_t a, uint64_t b) { + uint64_t ret = 0; + + ret = a - b; + ret >>= 63; + + return (bit)ret; +} + +static void cswap(uint32_t *x, uint32_t *y, bit swap) { + uint32_t m; + uint32_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +static void cswap_63b(uint64_t *x, uint64_t *y, bit swap) { + uint64_t m; + uint64_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +/* output x = min(input x,input y) */ +/* output y = max(input x,input y) */ + +static void minmax(uint32_t *x, uint32_t *y) { + bit m; + + m = is_smaller(*y, *x); + cswap(x, y, m); +} + +static void minmax_63b(uint64_t *x, uint64_t *y) { + bit m; + + m = is_smaller_63b(*y, *x); + cswap_63b(x, y, m); +} + +/* merge first half of x[0],x[step],...,x[(2*n-1)*step] with second half */ +/* requires n to be a power of 2 */ + +static void merge(int n, uint32_t *x, int step) { + int i; + if (n == 1) { + minmax(&x[0], &x[step]); + } else { + merge(n / 2, x, step * 2); + merge(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax(&x[i * step], &x[(i + 1) * step]); + } + } +} + +static void merge_63b(int n, uint64_t *x, int step) { + int i; + if (n == 1) { + minmax_63b(&x[0], &x[step]); + } else { + merge_63b(n / 2, x, step * 2); + merge_63b(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax_63b(&x[i * step], &x[(i + 1) * step]); + } + } +} + +/* sort x[0],x[1],...,x[n-1] in place */ +/* requires n to be a power of 2 */ + +static void sort(int n, uint32_t *x) { + if (n <= 1) { + return; + } + sort(n / 2, x); + sort(n / 2, x + n / 2); + merge(n / 2, x, 1); +} + +void PQCLEAN_MCELIECE6688128_AVX_sort_63b(int n, uint64_t *x) { + if (n <= 1) { + return; + } + PQCLEAN_MCELIECE6688128_AVX_sort_63b(n / 2, x); + PQCLEAN_MCELIECE6688128_AVX_sort_63b(n / 2, x + n / 2); + merge_63b(n / 2, x, 1); +} + +/* y[pi[i]] = x[i] */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void composeinv(int n, uint32_t *y, const uint32_t *x, const uint32_t *pi) { // NC + int i; + uint32_t t[2 * N]; + + for (i = 0; i < n; ++i) { + t[i] = x[i] | (pi[i] << 16); + } + + sort(n, t); + + for (i = 0; i < n; ++i) { + y[i] = t[i] & 0xFFFF; + } +} + +/* ip[i] = j iff pi[i] = j */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void invert(int n, uint32_t *ip, const uint32_t *pi) { + int i; + + for (i = 0; i < n; i++) { + ip[i] = i; + } + + composeinv(n, ip, ip, pi); +} + + +static void flow(int w, uint32_t *x, const uint32_t *y, int t) { + bit m0; + bit m1; + + uint32_t b; + uint32_t y_copy = *y; + + m0 = is_smaller(*y & ((1 << w) - 1), *x & ((1 << w) - 1)); + m1 = is_smaller(0, t); + + cswap(x, &y_copy, m0); + b = m0 & m1; + *x ^= b << w; +} + +/* input: permutation pi */ +/* output: (2w-1)n/2 (or 0 if n==1) control bits c[0],c[step],c[2*step],... 
*/ +/* requires n = 2^w */ +static void controlbitsfrompermutation(int w, int n, int step, int off, unsigned char *c, const uint32_t *pi) { + int i; + int j; + int k; + int t; + uint32_t ip[N] = {0}; + uint32_t I[2 * N] = {0}; + uint32_t P[2 * N] = {0}; + uint32_t PI[2 * N] = {0}; + uint32_t T[2 * N] = {0}; + uint32_t piflip[N] = {0}; + uint32_t subpi[2][N / 2] = {{0}}; + + if (w == 1) { + c[ off / 8 ] |= (pi[0] & 1) << (off % 8); + } + if (w <= 1) { + return; + } + + invert(n, ip, pi); + + for (i = 0; i < n; ++i) { + I[i] = ip[i] | (1 << w); + I[n + i] = pi[i]; + } + + for (i = 0; i < 2 * n; ++i) { + P[i] = (i >> w) + (i & ((1 << w) - 2)) + ((i & 1) << w); + } + + for (t = 0; t < w; ++t) { + composeinv(2 * n, PI, P, I); + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &PI[i], t); + } + + for (i = 0; i < 2 * n; ++i) { + T[i] = I[i ^ 1]; + } + + composeinv(2 * n, I, I, T); + + for (i = 0; i < 2 * n; ++i) { + T[i] = P[i ^ 1]; + } + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &T[i], 1); + } + } + + for (i = 0; i < n; ++i) { + for (j = 0; j < w; ++j) { + piflip[i] = pi[i]; + } + } + + for (i = 0; i < n / 2; ++i) { + c[ (off + i * step) / 8 ] |= ((P[i * 2] >> w) & 1) << ((off + i * step) % 8); + } + for (i = 0; i < n / 2; ++i) { + c[ (off + ((w - 1)*n + i) * step) / 8 ] |= ((P[n + i * 2] >> w) & 1) << ((off + ((w - 1) * n + i) * step) % 8); + } + + for (i = 0; i < n / 2; ++i) { + cswap(&piflip[i * 2], &piflip[i * 2 + 1], (P[n + i * 2] >> w) & 1); + } + + for (k = 0; k < 2; ++k) { + for (i = 0; i < n / 2; ++i) { + subpi[k][i] = piflip[i * 2 + k] >> 1; + } + } + + for (k = 0; k < 2; ++k) { + controlbitsfrompermutation(w - 1, n / 2, step * 2, off + step * (n / 2 + k), c, subpi[k]); + } +} + +/* input: pi, a permutation*/ +/* output: out, control bits w.r.t. pi */ +void PQCLEAN_MCELIECE6688128_AVX_controlbits(unsigned char *out, const uint32_t *pi) { + unsigned int i; + unsigned char c[ (2 * GFBITS - 1) * (1 << GFBITS) / 16 ]; + + for (i = 0; i < sizeof(c); i++) { + c[i] = 0; + } + + controlbitsfrompermutation(GFBITS, (1 << GFBITS), 1, 0, c, pi); + + for (i = 0; i < sizeof(c); i++) { + out[i] = c[i]; + } +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/controlbits.h b/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/controlbits.h new file mode 100644 index 0000000000..4e225dafc5 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/controlbits.h @@ -0,0 +1,15 @@ +#ifndef PQCLEAN_MCELIECE6688128_AVX_CONTROLBITS_H +#define PQCLEAN_MCELIECE6688128_AVX_CONTROLBITS_H +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. 
a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + + +#include + +void PQCLEAN_MCELIECE6688128_AVX_sort_63b(int n, uint64_t *x); +void PQCLEAN_MCELIECE6688128_AVX_controlbits(unsigned char *out, const uint32_t *pi); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/crypto_hash.h b/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/crypto_hash.h new file mode 100644 index 0000000000..ff4d88bc19 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/crypto_hash.h @@ -0,0 +1,7 @@ +#ifndef PQCLEAN_MCELIECE6688128_AVX_CRYPTO_HASH_H +#define PQCLEAN_MCELIECE6688128_AVX_CRYPTO_HASH_H +#include "fips202.h" + +#define crypto_hash_32b(out,in,inlen) shake256(out, 32, in, inlen) + +#endif diff --git a/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/decrypt.c b/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/decrypt.c new file mode 100644 index 0000000000..4bdb61a312 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/decrypt.c @@ -0,0 +1,234 @@ +/* + This file is for Niederreiter decryption +*/ + +#include "decrypt.h" + +#include "benes.h" +#include "bm.h" +#include "fft.h" +#include "fft_tr.h" +#include "params.h" +#include "util.h" + +#include + +static void scaling(vec256 out[][GFBITS], vec256 inv[][GFBITS], const unsigned char *sk, vec256 *recv) { + int i, j; + + vec128 sk_int[ GFBITS ]; + vec256 eval[32][ GFBITS ]; + vec256 tmp[ GFBITS ]; + + // computing inverses + + PQCLEAN_MCELIECE6688128_AVX_irr_load(sk_int, sk); + + PQCLEAN_MCELIECE6688128_AVX_fft(eval, sk_int); + + for (i = 0; i < 32; i++) { + PQCLEAN_MCELIECE6688128_AVX_vec256_sq(eval[i], eval[i]); + } + + PQCLEAN_MCELIECE6688128_AVX_vec256_copy(inv[0], eval[0]); + + for (i = 1; i < 32; i++) { + PQCLEAN_MCELIECE6688128_AVX_vec256_mul(inv[i], inv[i - 1], eval[i]); + } + + PQCLEAN_MCELIECE6688128_AVX_vec256_inv(tmp, inv[31]); + + for (i = 30; i >= 0; i--) { + PQCLEAN_MCELIECE6688128_AVX_vec256_mul(inv[i + 1], tmp, inv[i]); + PQCLEAN_MCELIECE6688128_AVX_vec256_mul(tmp, tmp, eval[i + 1]); + } + + PQCLEAN_MCELIECE6688128_AVX_vec256_copy(inv[0], tmp); + + // + + for (i = 0; i < 32; i++) { + for (j = 0; j < GFBITS; j++) { + out[i][j] = PQCLEAN_MCELIECE6688128_AVX_vec256_and(inv[i][j], recv[i]); + } + } +} + +static void preprocess(vec128 *recv, const unsigned char *s) { + int i; + unsigned char r[ 1024 ]; + + for (i = 0; i < SYND_BYTES; i++) { + r[i] = s[i]; + } + + for (i = SYND_BYTES; i < 1024; i++) { + r[i] = 0; + } + + for (i = 0; i < 64; i++) { + recv[i] = PQCLEAN_MCELIECE6688128_AVX_load16(r + i * 16); + } +} + +static void postprocess(unsigned char *e, vec128 *err) { + int i; + unsigned char error8[ (1 << GFBITS) / 8 ]; + uint64_t v[2]; + + for (i = 0; i < 64; i++) { + v[0] = PQCLEAN_MCELIECE6688128_AVX_vec128_extract(err[i], 0); + v[1] = PQCLEAN_MCELIECE6688128_AVX_vec128_extract(err[i], 1); + + PQCLEAN_MCELIECE6688128_AVX_store8(error8 + i * 16 + 0, v[0]); + PQCLEAN_MCELIECE6688128_AVX_store8(error8 + i * 16 + 8, v[1]); + } + + for (i = 0; i < SYS_N / 8; i++) { + e[i] = error8[i]; + } +} + +static void scaling_inv(vec256 out[][GFBITS], vec256 inv[][GFBITS], vec256 *recv) { + int i, j; + + for (i = 0; i < 32; i++) { + for (j = 0; j < GFBITS; j++) { + out[i][j] = PQCLEAN_MCELIECE6688128_AVX_vec256_and(inv[i][j], recv[i]); + } + } +} + +static uint16_t weight_check(unsigned char *e, vec128 *error) { + int i; + uint16_t w0 = 0; + uint16_t w1 = 0; + uint16_t check; + + for (i = 0; i < 64; 
i++) { + w0 += _mm_popcnt_u64( PQCLEAN_MCELIECE6688128_AVX_vec128_extract(error[i], 0) ); + w0 += _mm_popcnt_u64( PQCLEAN_MCELIECE6688128_AVX_vec128_extract(error[i], 1) ); + } + + for (i = 0; i < SYS_N / 8; i++) { + w1 += _mm_popcnt_u64( e[i] ); + } + + check = (w0 ^ SYS_T) | (w1 ^ SYS_T); + check -= 1; + check >>= 15; + + return check; +} + +static uint16_t synd_cmp(vec256 *s0, vec256 *s1) { + int i; + vec256 diff; + + diff = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(s0[0], s1[0]); + + for (i = 1; i < GFBITS; i++) { + diff = PQCLEAN_MCELIECE6688128_AVX_vec256_or(diff, PQCLEAN_MCELIECE6688128_AVX_vec256_xor(s0[i], s1[i])); + } + + return (uint16_t)PQCLEAN_MCELIECE6688128_AVX_vec256_testz(diff); +} + +static void reformat_128to256(vec256 *out, vec128 *in) { + int i; + uint64_t v[4]; + + for (i = 0; i < 32; i++) { + v[0] = PQCLEAN_MCELIECE6688128_AVX_vec128_extract(in[2 * i + 0], 0); + v[1] = PQCLEAN_MCELIECE6688128_AVX_vec128_extract(in[2 * i + 0], 1); + v[2] = PQCLEAN_MCELIECE6688128_AVX_vec128_extract(in[2 * i + 1], 0); + v[3] = PQCLEAN_MCELIECE6688128_AVX_vec128_extract(in[2 * i + 1], 1); + + out[i] = PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(v[0], v[1], v[2], v[3]); + } +} + +static void reformat_256to128(vec128 *out, vec256 *in) { + int i; + uint64_t v[4]; + + for (i = 0; i < 32; i++) { + v[0] = PQCLEAN_MCELIECE6688128_AVX_vec256_extract(in[i], 0); + v[1] = PQCLEAN_MCELIECE6688128_AVX_vec256_extract(in[i], 1); + v[2] = PQCLEAN_MCELIECE6688128_AVX_vec256_extract(in[i], 2); + v[3] = PQCLEAN_MCELIECE6688128_AVX_vec256_extract(in[i], 3); + + out[2 * i + 0] = PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(v[0], v[1]); + out[2 * i + 1] = PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(v[2], v[3]); + } +} + +/* Niederreiter decryption with the Berlekamp decoder */ +/* intput: sk, secret key */ +/* c, ciphertext (syndrome) */ +/* output: e, error vector */ +/* return: 0 for success; 1 for failure */ +int PQCLEAN_MCELIECE6688128_AVX_decrypt(unsigned char *e, const unsigned char *sk, const unsigned char *c) { + int i; + + uint16_t check_synd; + uint16_t check_weight; + + vec256 inv[ 32 ][ GFBITS ]; + vec256 scaled[ 32 ][ GFBITS ]; + vec256 eval[32][ GFBITS ]; + + vec128 error128[ 64 ]; + vec256 error256[ 32 ]; + + vec256 s_priv[ GFBITS ]; + vec256 s_priv_cmp[ GFBITS ]; + vec128 locator[ GFBITS ]; + + vec128 recv128[ 64 ]; + vec256 recv256[ 32 ]; + vec256 allone; + + vec128 bits_int[25][32]; + + // Berlekamp decoder + + preprocess(recv128, c); + + PQCLEAN_MCELIECE6688128_AVX_load_bits(bits_int, sk + IRR_BYTES); + PQCLEAN_MCELIECE6688128_AVX_benes(recv128, bits_int, 1); + + reformat_128to256(recv256, recv128); + + scaling(scaled, inv, sk, recv256); + PQCLEAN_MCELIECE6688128_AVX_fft_tr(s_priv, scaled); + PQCLEAN_MCELIECE6688128_AVX_bm(locator, s_priv); + + PQCLEAN_MCELIECE6688128_AVX_fft(eval, locator); + + // reencryption and weight check + + allone = PQCLEAN_MCELIECE6688128_AVX_vec256_set1_16b(0xFFFF); + + for (i = 0; i < 32; i++) { + error256[i] = PQCLEAN_MCELIECE6688128_AVX_vec256_or_reduce(eval[i]); + error256[i] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(error256[i], allone); + } + + scaling_inv(scaled, inv, error256); + PQCLEAN_MCELIECE6688128_AVX_fft_tr(s_priv_cmp, scaled); + + check_synd = synd_cmp(s_priv, s_priv_cmp); + + // + + reformat_256to128(error128, error256); + PQCLEAN_MCELIECE6688128_AVX_benes(error128, bits_int, 0); + + postprocess(e, error128); + + check_weight = weight_check(e, error128); + + return 1 - (check_synd & check_weight); +} + diff --git 
a/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/decrypt.h b/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/decrypt.h new file mode 100644 index 0000000000..79cf19fa3a --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/decrypt.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_MCELIECE6688128_AVX_DECRYPT_H +#define PQCLEAN_MCELIECE6688128_AVX_DECRYPT_H +/* + This file is for Nieddereiter decryption +*/ + +int PQCLEAN_MCELIECE6688128_AVX_decrypt(unsigned char * /*e*/, const unsigned char * /*sk*/, const unsigned char * /*c*/); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/encrypt.c b/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/encrypt.c new file mode 100644 index 0000000000..4ba2d831f0 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/encrypt.c @@ -0,0 +1,104 @@ +/* + This file is for Niederreiter encryption +*/ + +#include "encrypt.h" + +#include "int32_sort.h" +#include "params.h" +#include "randombytes.h" +#include "util.h" + +#include +#include +#include +#include + +#include "gf.h" + +/* input: public key pk, error vector e */ +/* output: syndrome s */ +extern void PQCLEAN_MCELIECE6688128_AVX_syndrome_asm(unsigned char *s, const unsigned char *pk, unsigned char *e); + +/* output: e, an error vector of weight t */ +static void gen_e(unsigned char *e) { + int i, j, eq, count; + + uint16_t ind[ SYS_T * 2 ]; + int32_t ind32[ SYS_T * 2 ]; + uint64_t e_int[ (SYS_N + 63) / 64 ]; + uint64_t one = 1; + uint64_t mask; + uint64_t val[ SYS_T ]; + + while (1) { + randombytes((unsigned char *) ind, sizeof(ind)); + + for (i = 0; i < SYS_T * 2; i++) { + ind[i] &= GFMASK; + } + + // moving and counting indices in the correct range + + count = 0; + for (i = 0; i < SYS_T * 2; i++) { + if (ind[i] < SYS_N) { + ind32[ count++ ] = ind[i]; + } + } + + if (count < SYS_T) { + continue; + } + + // check for repetition + + PQCLEAN_MCELIECE6688128_AVX_int32_sort(ind32, SYS_T); + + eq = 0; + for (i = 1; i < SYS_T; i++) { + if (ind32[i - 1] == ind32[i]) { + eq = 1; + } + } + + if (eq == 0) { + break; + } + } + + for (j = 0; j < SYS_T; j++) { + val[j] = one << (ind32[j] & 63); + } + + for (i = 0; i < (SYS_N + 63) / 64; i++) { + e_int[i] = 0; + + for (j = 0; j < SYS_T; j++) { + mask = i ^ (ind32[j] >> 6); + mask -= 1; + mask >>= 63; + mask = -mask; + + e_int[i] |= val[j] & mask; + } + } + + for (i = 0; i < (SYS_N + 63) / 64 - 1; i++) { + PQCLEAN_MCELIECE6688128_AVX_store8(e, e_int[i]); + e += 8; + } + + for (j = 0; j < (SYS_N % 64); j += 8) { + e[ j / 8 ] = (e_int[i] >> j) & 0xFF; + } +} + +/* input: public key pk */ +/* output: error vector e, syndrome s */ +void PQCLEAN_MCELIECE6688128_AVX_encrypt(unsigned char *s, unsigned char *e, const unsigned char *pk) { + gen_e(e); + + PQCLEAN_MCELIECE6688128_AVX_syndrome_asm(s, pk, e); +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/encrypt.h b/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/encrypt.h new file mode 100644 index 0000000000..85f14f6310 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/encrypt.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_MCELIECE6688128_AVX_ENCRYPT_H +#define PQCLEAN_MCELIECE6688128_AVX_ENCRYPT_H +/* + This file is for Niederreiter encryption +*/ + + +void PQCLEAN_MCELIECE6688128_AVX_encrypt(unsigned char * /*s*/, unsigned char * /*e*/, const unsigned char * /*pk*/); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/fft.c b/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/fft.c new file mode 
100644 index 0000000000..821976b603 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/fft.c @@ -0,0 +1,275 @@ +/* + This file is for the Gao-Mateer FFT + sse http://www.math.clemson.edu/~sgao/papers/GM10.pdf +*/ + +#include "fft.h" + +#include "transpose.h" +#include "vec256.h" + +#include + +/* input: in, polynomial in bitsliced form */ +/* output: in, result of applying the radix conversions on in */ +static void radix_conversions(vec128 *in) { + int i, j, k; + vec128 t; + uint64_t v0, v1; + + const vec128 mask[5][2] = { + { + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0x8888888888888888, 0x8888888888888888), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0x4444444444444444, 0x4444444444444444) + }, + { + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0xC0C0C0C0C0C0C0C0, 0xC0C0C0C0C0C0C0C0), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0x3030303030303030, 0x3030303030303030) + }, + { + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0xF000F000F000F000, 0xF000F000F000F000), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0x0F000F000F000F00, 0x0F000F000F000F00) + }, + { + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0xFF000000FF000000, 0xFF000000FF000000), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0x00FF000000FF0000, 0x00FF000000FF0000) + }, + { + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0xFFFF000000000000, 0xFFFF000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0x0000FFFF00000000, 0x0000FFFF00000000) + } + }; + + const vec128 s[5][GFBITS] = { +#include "scalars_2x.inc" + }; + + // + + for (j = 0; j <= 5; j++) { + for (i = 0; i < GFBITS; i++) { + v1 = PQCLEAN_MCELIECE6688128_AVX_vec128_extract(in[i], 1); + v1 ^= v1 >> 32; + v0 = PQCLEAN_MCELIECE6688128_AVX_vec128_extract(in[i], 0); + v0 ^= v1 << 32; + in[i] = PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(v0, v1); + } + + for (i = 0; i < GFBITS; i++) { + for (k = 4; k >= j; k--) { + t = PQCLEAN_MCELIECE6688128_AVX_vec128_and(in[i], mask[k][0]); + t = PQCLEAN_MCELIECE6688128_AVX_vec128_srl_2x(t, 1 << k); + in[i] = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(in[i], t); + + t = PQCLEAN_MCELIECE6688128_AVX_vec128_and(in[i], mask[k][1]); + t = PQCLEAN_MCELIECE6688128_AVX_vec128_srl_2x(t, 1 << k); + in[i] = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(in[i], t); + } + } + + if (j < 5) { + PQCLEAN_MCELIECE6688128_AVX_vec128_mul(in, in, s[j]); // scaling + } + } +} + +/* input: in, result of applying the radix conversions to the input polynomial */ +/* output: out, evaluation results (by applying the FFT butterflies) */ +static void butterflies(vec256 out[][ GFBITS ], vec128 *in) { + int i, j, k, s, b; + + vec128 tmp[ GFBITS ]; + vec256 tmp0[ GFBITS ]; + vec256 tmp1[ GFBITS ]; + vec128 t[ GFBITS ]; + + union { + vec128 v[8][ GFBITS + 1 ]; + vec256 V[8][ (GFBITS + 1) / 2 ]; + } pre; + + union { + vec128 v[64][ 2 ]; + vec256 V[64]; + } buf; + + uint64_t v0, v1; + + const vec256 consts[ 33 ][ GFBITS ] = { +#include "consts.inc" + }; + + const vec256 powers[ 32 ][ GFBITS ] = { +#include "powers.inc" + }; + + uint64_t consts_ptr = 2; + + const unsigned char reversal[64] = { + 0, 32, 16, 48, 8, 40, 24, 56, + 4, 36, 20, 52, 12, 44, 28, 60, + 2, 34, 18, 50, 10, 42, 26, 58, + 6, 38, 22, 54, 14, 46, 30, 62, + 1, 33, 17, 49, 9, 41, 25, 57, + 5, 37, 21, 53, 13, 45, 29, 61, + 3, 35, 19, 51, 11, 43, 27, 59, + 7, 39, 23, 55, 15, 47, 31, 63 + }; + + const uint16_t beta[8] = {2522, 7827, 7801, 8035, 6897, 8167, 3476, 0}; + + // boradcast + + for (j = 0; j < GFBITS; j++) { + t[j] = PQCLEAN_MCELIECE6688128_AVX_vec128_unpack_high(in[j], in[j]); + } + + for (i = 0; i < 8; i += 2) { 
+ for (j = 0; j < GFBITS; j++) { + v0 = (beta[i + 0] >> j) & 1; + v0 = -v0; + v1 = (beta[i + 1] >> j) & 1; + v1 = -v1; + + tmp[j] = PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(v0, v1); + } + + PQCLEAN_MCELIECE6688128_AVX_vec128_mul(tmp, t, tmp); + + for (j = 0; j < GFBITS; j++) { + pre.v[i + 0][j] = PQCLEAN_MCELIECE6688128_AVX_vec128_unpack_low(tmp[j], tmp[j]); + pre.v[i + 1][j] = PQCLEAN_MCELIECE6688128_AVX_vec128_unpack_high(tmp[j], tmp[j]); + } + } + + for (i = 0; i < GFBITS; i += 2) { + if (i != GFBITS - 1) { + buf.v[0][1] = PQCLEAN_MCELIECE6688128_AVX_vec128_unpack_low(in[i + 1], in[i + 1] ^ pre.v[6][i + 1]); + } + buf.v[0][0] = PQCLEAN_MCELIECE6688128_AVX_vec128_unpack_low(in[i + 0], in[i + 0] ^ pre.v[6][i + 0]); + + buf.V[1] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[0], pre.V[0][i / 2]); + buf.V[16] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[0], pre.V[4][i / 2]); + buf.V[3] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[1], pre.V[1][i / 2]); + buf.V[48] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[16], pre.V[5][i / 2]); + buf.V[49] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[48], pre.V[0][i / 2]); + buf.V[2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[0], pre.V[1][i / 2]); + buf.V[51] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[49], pre.V[1][i / 2]); + buf.V[6] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[2], pre.V[2][i / 2]); + buf.V[50] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[51], pre.V[0][i / 2]); + buf.V[7] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[6], pre.V[0][i / 2]); + buf.V[54] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[50], pre.V[2][i / 2]); + buf.V[5] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[7], pre.V[1][i / 2]); + buf.V[55] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[54], pre.V[0][i / 2]); + buf.V[53] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[55], pre.V[1][i / 2]); + buf.V[4] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[0], pre.V[2][i / 2]); + buf.V[52] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[53], pre.V[0][i / 2]); + buf.V[12] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[4], pre.V[3][i / 2]); + buf.V[60] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[52], pre.V[3][i / 2]); + buf.V[13] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[12], pre.V[0][i / 2]); + buf.V[61] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[60], pre.V[0][i / 2]); + buf.V[15] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[13], pre.V[1][i / 2]); + buf.V[63] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[61], pre.V[1][i / 2]); + buf.V[14] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[15], pre.V[0][i / 2]); + buf.V[62] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[63], pre.V[0][i / 2]); + buf.V[10] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[14], pre.V[2][i / 2]); + buf.V[58] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[62], pre.V[2][i / 2]); + buf.V[11] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[10], pre.V[0][i / 2]); + buf.V[59] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[58], pre.V[0][i / 2]); + buf.V[9] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[11], pre.V[1][i / 2]); + buf.V[57] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[59], pre.V[1][i / 2]); + buf.V[56] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[57], pre.V[0][i / 2]); + buf.V[8] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[0], pre.V[3][i / 2]); + buf.V[40] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[56], pre.V[4][i / 2]); + buf.V[24] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[8], pre.V[4][i / 2]); + buf.V[41] = 
PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[40], pre.V[0][i / 2]); + buf.V[25] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[24], pre.V[0][i / 2]); + buf.V[43] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[41], pre.V[1][i / 2]); + buf.V[27] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[25], pre.V[1][i / 2]); + buf.V[42] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[43], pre.V[0][i / 2]); + buf.V[26] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[27], pre.V[0][i / 2]); + buf.V[46] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[42], pre.V[2][i / 2]); + buf.V[30] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[26], pre.V[2][i / 2]); + buf.V[47] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[46], pre.V[0][i / 2]); + buf.V[31] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[30], pre.V[0][i / 2]); + buf.V[45] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[47], pre.V[1][i / 2]); + buf.V[29] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[31], pre.V[1][i / 2]); + buf.V[44] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[45], pre.V[0][i / 2]); + buf.V[28] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[29], pre.V[0][i / 2]); + buf.V[36] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[44], pre.V[3][i / 2]); + buf.V[20] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[28], pre.V[3][i / 2]); + buf.V[37] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[36], pre.V[0][i / 2]); + buf.V[21] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[20], pre.V[0][i / 2]); + buf.V[39] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[37], pre.V[1][i / 2]); + buf.V[23] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[21], pre.V[1][i / 2]); + buf.V[38] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[39], pre.V[0][i / 2]); + buf.V[22] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[23], pre.V[0][i / 2]); + buf.V[34] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[38], pre.V[2][i / 2]); + buf.V[18] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[22], pre.V[2][i / 2]); + buf.V[35] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[34], pre.V[0][i / 2]); + buf.V[19] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[18], pre.V[0][i / 2]); + buf.V[33] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[35], pre.V[1][i / 2]); + buf.V[17] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[19], pre.V[1][i / 2]); + buf.V[32] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[33], pre.V[0][i / 2]); + + + // transpose + + PQCLEAN_MCELIECE6688128_AVX_transpose_64x256_sp(buf.V); + + for (j = 0; j < 32; j++) { + if (i != GFBITS - 1) { + out[j][i + 1] = PQCLEAN_MCELIECE6688128_AVX_vec256_unpack_high(buf.V[ reversal[2 * j + 0] ], buf.V[ reversal[2 * j + 1] ]); + } + out[j][i + 0] = PQCLEAN_MCELIECE6688128_AVX_vec256_unpack_low (buf.V[ reversal[2 * j + 0] ], buf.V[ reversal[2 * j + 1] ]); + } + } + + // butterflies + + for (k = 0; k < 32; k += 2) { + for (b = 0; b < GFBITS; b++) { + tmp0[b] = PQCLEAN_MCELIECE6688128_AVX_vec256_unpack_low (out[k][b], out[k + 1][b]); + } + for (b = 0; b < GFBITS; b++) { + tmp1[b] = PQCLEAN_MCELIECE6688128_AVX_vec256_unpack_high (out[k][b], out[k + 1][b]); + } + + PQCLEAN_MCELIECE6688128_AVX_vec256_maa_asm(tmp0, tmp1, consts[1]); + + for (b = 0; b < GFBITS; b++) { + out[k][b] = PQCLEAN_MCELIECE6688128_AVX_vec256_unpack_low (tmp0[b], tmp1[b]); + } + for (b = 0; b < GFBITS; b++) { + out[k + 1][b] = PQCLEAN_MCELIECE6688128_AVX_vec256_unpack_high (tmp0[b], tmp1[b]); + } + } + + for (i = 0; i <= 4; i++) { + s = 1 << i; + + for (j = 0; j < 32; j += 2 * s) { + for (k = j; k < j + s; k++) { + PQCLEAN_MCELIECE6688128_AVX_vec256_maa_asm(out[k], out[k + 
s], consts[ consts_ptr + (k - j) ]); + } + } + + consts_ptr += (1 << i); + } + + // adding the part contributed by x^128 + + for (i = 0; i < 32; i++) { + for (b = 0; b < GFBITS; b++) { + out[i][b] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(out[i][b], powers[i][b]); + } + } +} + +/* input: in, polynomial in bitsliced form */ +/* output: out, bitsliced results of evaluating in all the field elements */ +void PQCLEAN_MCELIECE6688128_AVX_fft(vec256 out[][GFBITS], vec128 *in) { + radix_conversions(in); + butterflies(out, in); +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/fft.h b/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/fft.h new file mode 100644 index 0000000000..bde52a784a --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/fft.h @@ -0,0 +1,17 @@ +#ifndef PQCLEAN_MCELIECE6688128_AVX_FFT_H +#define PQCLEAN_MCELIECE6688128_AVX_FFT_H +/* + This file is for the Gao-Mateer FFT + sse http://www.math.clemson.edu/~sgao/papers/GM10.pdf +*/ + + +#include "params.h" +#include "vec128.h" +#include "vec256.h" +#include + +void PQCLEAN_MCELIECE6688128_AVX_fft(vec256 /*out*/[][GFBITS], vec128 * /*in*/); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/fft_tr.c b/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/fft_tr.c new file mode 100644 index 0000000000..d4387000cd --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/fft_tr.c @@ -0,0 +1,379 @@ +/* + This file is for transpose of the Gao-Mateer FFT + Functions with names ending with _tr are (roughly) the transpose of the corresponding functions in fft.c +*/ + +#include "fft_tr.h" + +#include "transpose.h" + +#include + +static void radix_conversions_tr(vec256 *in) { + int i, j, k; + vec256 t; + uint64_t v[4]; + + const vec256 mask[6][2] = { + { + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0x2222222222222222, 0x2222222222222222, 0x2222222222222222, 0x2222222222222222), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0x4444444444444444, 0x4444444444444444, 0x4444444444444444, 0x4444444444444444) + }, + { + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0x0C0C0C0C0C0C0C0C, 0x0C0C0C0C0C0C0C0C, 0x0C0C0C0C0C0C0C0C, 0x0C0C0C0C0C0C0C0C), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0x3030303030303030, 0x3030303030303030, 0x3030303030303030, 0x3030303030303030) + }, + { + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0x00F000F000F000F0, 0x00F000F000F000F0, 0x00F000F000F000F0, 0x00F000F000F000F0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0x0F000F000F000F00, 0x0F000F000F000F00, 0x0F000F000F000F00, 0x0F000F000F000F00) + }, + { + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0x0000FF000000FF00, 0x0000FF000000FF00, 0x0000FF000000FF00, 0x0000FF000000FF00), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0x00FF000000FF0000, 0x00FF000000FF0000, 0x00FF000000FF0000, 0x00FF000000FF0000) + }, + { + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0x00000000FFFF0000, 0x00000000FFFF0000, 0x00000000FFFF0000, 0x00000000FFFF0000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0x0000FFFF00000000, 0x0000FFFF00000000, 0x0000FFFF00000000, 0x0000FFFF00000000) + }, + { + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0x00000000FFFFFFFF, 0x00000000FFFFFFFF, 0x00000000FFFFFFFF, 0x00000000FFFFFFFF) + } + }; + + const vec256 s[6][GFBITS] = { +#include "scalars_4x.inc" + }; + + // + + for (j = 6; j >= 0; j--) { + if (j < 6) { + PQCLEAN_MCELIECE6688128_AVX_vec256_mul(in, in, s[j]); // scaling + } + + for (k = 
j; k <= 4; k++) { + for (i = 0; i < GFBITS; i++) { + t = PQCLEAN_MCELIECE6688128_AVX_vec256_and(in[i], mask[k][0]); + t = PQCLEAN_MCELIECE6688128_AVX_vec256_sll_4x(t, 1 << k); + in[i] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(in[i], t); + + t = PQCLEAN_MCELIECE6688128_AVX_vec256_and(in[i], mask[k][1]); + t = PQCLEAN_MCELIECE6688128_AVX_vec256_sll_4x(t, 1 << k); + in[i] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(in[i], t); + } + } + + if (j <= 5) { + for (i = 0; i < GFBITS; i++) { + v[0] = PQCLEAN_MCELIECE6688128_AVX_vec256_extract(in[i], 0); + v[1] = PQCLEAN_MCELIECE6688128_AVX_vec256_extract(in[i], 1); + v[2] = PQCLEAN_MCELIECE6688128_AVX_vec256_extract(in[i], 2); + v[3] = PQCLEAN_MCELIECE6688128_AVX_vec256_extract(in[i], 3); + + v[1] ^= v[0] >> 32; + v[1] ^= v[1] << 32; + v[3] ^= v[2] >> 32; + v[3] ^= v[3] << 32; + + in[i] = PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(v[0], v[1], v[2], v[3]); + } + } + + for (i = 0; i < GFBITS; i++) { + v[0] = PQCLEAN_MCELIECE6688128_AVX_vec256_extract(in[i], 0); + v[1] = PQCLEAN_MCELIECE6688128_AVX_vec256_extract(in[i], 1); + v[2] = PQCLEAN_MCELIECE6688128_AVX_vec256_extract(in[i], 2); + v[3] = PQCLEAN_MCELIECE6688128_AVX_vec256_extract(in[i], 3); + + v[2] ^= v[1]; + v[3] ^= v[2]; + + in[i] = PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(v[0], v[1], v[2], v[3]); + } + + } +} + +static void butterflies_tr(vec256 *out, vec256 in[][ GFBITS ]) { + int i, j, k, s, b; + + vec256 t0[ GFBITS ]; + vec256 t1[ GFBITS ]; + vec256 t; + + vec128 out128[ GFBITS ][ 2 ]; + vec128 tmp[ GFBITS ]; + + union { + vec128 v[6][ GFBITS + 1 ]; + vec256 V[6][ (GFBITS + 1) / 2 ]; + } pre; + + union { + vec128 v[64][ 2 ]; + vec256 V[64]; + } buf; + + const vec256 consts[ 33 ][ GFBITS ] = { +#include "consts.inc" + }; + + uint64_t v[4]; + uint64_t consts_ptr = 33; + + const unsigned char reversal[] = { + 0, 32, 16, 48, 8, 40, 24, 56, + 4, 36, 20, 52, 12, 44, 28, 60, + 2, 34, 18, 50, 10, 42, 26, 58, + 6, 38, 22, 54, 14, 46, 30, 62, + 1, 33, 17, 49, 9, 41, 25, 57, + 5, 37, 21, 53, 13, 45, 29, 61, + 3, 35, 19, 51, 11, 43, 27, 59, + 7, 39, 23, 55, 15, 47, 31, 63 + }; + + const uint16_t beta[6] = {5246, 5306, 6039, 6685, 4905, 6755}; + + // butterflies + + for (i = 4; i >= 0; i--) { + s = 1 << i; + consts_ptr -= s; + + for (j = 0; j < 32; j += 2 * s) { + for (k = j; k < j + s; k++) { + PQCLEAN_MCELIECE6688128_AVX_vec256_ama_asm(in[k], in[k + s], consts[ consts_ptr + (k - j) ]); + } + } + + } + + for (k = 0; k < 32; k += 2) { + for (b = 0; b < GFBITS; b++) { + t0[b] = PQCLEAN_MCELIECE6688128_AVX_vec256_unpack_low(in[k][b], in[k + 1][b]); + } + for (b = 0; b < GFBITS; b++) { + t1[b] = PQCLEAN_MCELIECE6688128_AVX_vec256_unpack_high(in[k][b], in[k + 1][b]); + } + + PQCLEAN_MCELIECE6688128_AVX_vec256_ama_asm(t0, t1, consts[1]); + + for (b = 0; b < GFBITS; b++) { + in[k][b] = PQCLEAN_MCELIECE6688128_AVX_vec256_unpack_low(t0[b], t1[b]); + } + for (b = 0; b < GFBITS; b++) { + in[k + 1][b] = PQCLEAN_MCELIECE6688128_AVX_vec256_unpack_high(t0[b], t1[b]); + } + + for (b = 0; b < GFBITS; b++) { + t0[b] = PQCLEAN_MCELIECE6688128_AVX_vec256_unpack_low_2x(in[k][b], in[k + 1][b]); + } + for (b = 0; b < GFBITS; b++) { + t1[b] = PQCLEAN_MCELIECE6688128_AVX_vec256_unpack_high_2x(in[k][b], in[k + 1][b]); + } + + PQCLEAN_MCELIECE6688128_AVX_vec256_ama_asm(t0, t1, consts[0]); + + for (b = 0; b < GFBITS; b++) { + in[k + 0][b] = PQCLEAN_MCELIECE6688128_AVX_vec256_unpack_low_2x(t0[b], t1[b]); + } + for (b = 0; b < GFBITS; b++) { + in[k + 1][b] = PQCLEAN_MCELIECE6688128_AVX_vec256_unpack_high_2x(t0[b], t1[b]); + } + } + + + 
// boradcast + + for (i = 0; i < GFBITS; i += 2) { + // transpose + + for (k = 0; k < 32; k++) { + if (i != GFBITS - 1) { + buf.v[ reversal[2 * k + 0] ][1] = PQCLEAN_MCELIECE6688128_AVX_vec256_extract2x(in[ k ][i + 1], 0); + buf.v[ reversal[2 * k + 1] ][1] = PQCLEAN_MCELIECE6688128_AVX_vec256_extract2x(in[ k ][i + 1], 1); + } + + buf.v[ reversal[2 * k + 0] ][0] = PQCLEAN_MCELIECE6688128_AVX_vec256_extract2x(in[ k ][i + 0], 0); + buf.v[ reversal[2 * k + 1] ][0] = PQCLEAN_MCELIECE6688128_AVX_vec256_extract2x(in[ k ][i + 0], 1); + } + + PQCLEAN_MCELIECE6688128_AVX_transpose_64x256_sp(buf.V); + + // + + pre.V[0][i / 2] = buf.V[32]; + buf.V[33] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[33], buf.V[32]); + pre.V[1][i / 2] = buf.V[33]; + buf.V[35] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[35], buf.V[33]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[35]); + buf.V[34] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[34], buf.V[35]); + pre.V[2][i / 2] = buf.V[34]; + buf.V[38] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[38], buf.V[34]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[38]); + buf.V[39] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[39], buf.V[38]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[1][i / 2], buf.V[39]); + buf.V[37] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[37], buf.V[39]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[37]); + buf.V[36] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[36], buf.V[37]); + pre.V[3][i / 2] = buf.V[36]; + buf.V[44] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[44], buf.V[36]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[44]); + buf.V[45] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[45], buf.V[44]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[1][i / 2], buf.V[45]); + buf.V[47] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[47], buf.V[45]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[47]); + buf.V[46] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[46], buf.V[47]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[2][i / 2], buf.V[46]); + buf.V[42] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[42], buf.V[46]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[42]); + buf.V[43] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[43], buf.V[42]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[1][i / 2], buf.V[43]); + buf.V[41] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[41], buf.V[43]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[41]); + buf.V[40] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[40], buf.V[41]); + pre.V[4][i / 2] = buf.V[40]; + buf.V[56] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[56], buf.V[40]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[56]); + buf.V[57] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[57], buf.V[56]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[1][i / 2], buf.V[57]); + buf.V[59] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[59], buf.V[57]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[59]); + buf.V[58] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[58], buf.V[59]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[2][i / 2], buf.V[58]); + buf.V[62] = 
PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[62], buf.V[58]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[62]); + buf.V[63] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[63], buf.V[62]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[1][i / 2], buf.V[63]); + buf.V[61] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[61], buf.V[63]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[61]); + buf.V[60] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[60], buf.V[61]); + pre.V[3][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[3][i / 2], buf.V[60]); + buf.V[52] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[52], buf.V[60]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[52]); + buf.V[53] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[53], buf.V[52]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[1][i / 2], buf.V[53]); + buf.V[55] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[55], buf.V[53]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[55]); + buf.V[54] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[54], buf.V[55]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[2][i / 2], buf.V[54]); + buf.V[50] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[50], buf.V[54]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[50]); + buf.V[51] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[51], buf.V[50]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[1][i / 2], buf.V[51]); + buf.V[49] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[49], buf.V[51]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[49]); + buf.V[48] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[48], buf.V[49]); + pre.V[5][i / 2] = buf.V[48]; + buf.V[16] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[16], buf.V[48]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[16]); + buf.V[17] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[17], buf.V[16]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[1][i / 2], buf.V[17]); + buf.V[19] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[19], buf.V[17]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[19]); + buf.V[18] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[18], buf.V[19]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[2][i / 2], buf.V[18]); + buf.V[22] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[22], buf.V[18]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[22]); + buf.V[23] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[23], buf.V[22]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[1][i / 2], buf.V[23]); + buf.V[21] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[21], buf.V[23]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[21]); + buf.V[20] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[20], buf.V[21]); + pre.V[3][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[3][i / 2], buf.V[20]); + buf.V[28] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[28], buf.V[20]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[28]); + buf.V[29] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[29], buf.V[28]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[1][i / 2], buf.V[29]); + buf.V[31] = 
PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[31], buf.V[29]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[31]); + buf.V[30] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[30], buf.V[31]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[2][i / 2], buf.V[30]); + buf.V[26] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[26], buf.V[30]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[26]); + buf.V[27] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[27], buf.V[26]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[1][i / 2], buf.V[27]); + buf.V[25] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[25], buf.V[27]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[25]); + buf.V[24] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[24], buf.V[25]); + pre.V[4][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[4][i / 2], buf.V[24]); + buf.V[8] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[8], buf.V[24]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[8]); + buf.V[9] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[9], buf.V[8]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[1][i / 2], buf.V[9]); + buf.V[11] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[11], buf.V[9]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[11]); + buf.V[10] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[10], buf.V[11]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[2][i / 2], buf.V[10]); + buf.V[14] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[14], buf.V[10]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[14]); + buf.V[15] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[15], buf.V[14]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[1][i / 2], buf.V[15]); + buf.V[13] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[13], buf.V[15]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[13]); + buf.V[12] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[12], buf.V[13]); + pre.V[3][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[3][i / 2], buf.V[12]); + buf.V[4] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[4], buf.V[12]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[4]); + buf.V[5] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[5], buf.V[4]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[1][i / 2], buf.V[5]); + buf.V[7] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[7], buf.V[5]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[7]); + buf.V[6] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[6], buf.V[7]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[2][i / 2], buf.V[6]); + buf.V[2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[2], buf.V[6]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[2]); + buf.V[3] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[3], buf.V[2]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[1][i / 2], buf.V[3]); + buf.V[1] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[1], buf.V[3]); + + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[1]); + t = PQCLEAN_MCELIECE6688128_AVX_vec256_xor(buf.V[0], buf.V[1]); + + if (i != GFBITS - 1) { + out128[i + 1][0] = 
PQCLEAN_MCELIECE6688128_AVX_vec256_extract2x(t, 1); + } + out128[i + 0][0] = PQCLEAN_MCELIECE6688128_AVX_vec256_extract2x(t, 0); + + } + + // + + for (j = 0; j < GFBITS; j++) { + tmp[j] = PQCLEAN_MCELIECE6688128_AVX_vec128_setbits((beta[0] >> j) & 1); + } + + PQCLEAN_MCELIECE6688128_AVX_vec128_mul(tmp, pre.v[0], tmp); + + for (b = 0; b < GFBITS; b++) { + out128[b][1] = tmp[b]; + } + + for (i = 1; i < 6; i++) { + for (j = 0; j < GFBITS; j++) { + tmp[j] = PQCLEAN_MCELIECE6688128_AVX_vec128_setbits((beta[i] >> j) & 1); + } + + PQCLEAN_MCELIECE6688128_AVX_vec128_mul(tmp, pre.v[i], tmp); + + for (b = 0; b < GFBITS; b++) { + out128[b][1] = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(out128[b][1], tmp[b]); + } + } + + for (b = 0; b < GFBITS; b++) { + v[0] = PQCLEAN_MCELIECE6688128_AVX_vec128_extract(out128[b][0], 0); + v[1] = PQCLEAN_MCELIECE6688128_AVX_vec128_extract(out128[b][0], 1); + v[2] = PQCLEAN_MCELIECE6688128_AVX_vec128_extract(out128[b][1], 0); + v[3] = PQCLEAN_MCELIECE6688128_AVX_vec128_extract(out128[b][1], 1); + + out[b] = PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(v[0], v[1], v[2], v[3]); + } +} + +void PQCLEAN_MCELIECE6688128_AVX_fft_tr(vec256 *out, vec256 in[][ GFBITS ]) { + butterflies_tr(out, in); + radix_conversions_tr(out); +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/fft_tr.h b/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/fft_tr.h new file mode 100644 index 0000000000..2943d39673 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/fft_tr.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE6688128_AVX_FFT_TR_H +#define PQCLEAN_MCELIECE6688128_AVX_FFT_TR_H +/* + This file is for transpose of the Gao-Mateer FFT +*/ + + +#include "params.h" +#include "vec256.h" + +void PQCLEAN_MCELIECE6688128_AVX_fft_tr(vec256 * /*out*/, vec256 /*in*/[][ GFBITS ]); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/gf.c b/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/gf.c new file mode 100644 index 0000000000..e1d66bcd82 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/gf.c @@ -0,0 +1,236 @@ +/* + this file is for functions for field arithmetic +*/ + +#include "gf.h" + +#include "params.h" + +#include + +/* field multiplication */ +gf PQCLEAN_MCELIECE6688128_AVX_gf_mul(gf in0, gf in1) { + int i; + + uint64_t tmp; + uint64_t t0; + uint64_t t1; + uint64_t t; + + t0 = in0; + t1 = in1; + + tmp = t0 * (t1 & 1); + + for (i = 1; i < GFBITS; i++) { + tmp ^= (t0 * (t1 & ((uint64_t)1 << i))); + } + + // + + t = tmp & 0x1FF0000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + t = tmp & 0x000E000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + return tmp & GFMASK; +} + +/* 2 field multiplications */ +uint64_t PQCLEAN_MCELIECE6688128_AVX_gf_mul2(gf a, gf b0, gf b1) { + int i; + + uint64_t tmp = 0; + uint64_t t0; + uint64_t t1; + uint64_t t; + uint64_t mask = 0x0000000100000001; + + t0 = a; + t1 = b1; + t1 = (t1 << 32) | b0; + + for (i = 0; i < GFBITS; i++) { + tmp ^= t0 * (t1 & mask); + mask += mask; + } + + // + + t = tmp & 0x01FF000001FF0000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + t = tmp & 0x0000E0000000E000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + return tmp & 0x00001FFF00001FFF; +} + +/* 2 field squarings */ +static inline gf gf_sq2(gf in) { + int i; + + const uint64_t B[] = {0x1111111111111111, + 0x0303030303030303, + 0x000F000F000F000F, + 0x000000FF000000FF + }; + + const uint64_t M[] = {0x0001FF0000000000, + 0x000000FF80000000, + 
0x000000007FC00000, + 0x00000000003FE000 + }; + + uint64_t x = in; + uint64_t t; + + x = (x | (x << 24)) & B[3]; + x = (x | (x << 12)) & B[2]; + x = (x | (x << 6)) & B[1]; + x = (x | (x << 3)) & B[0]; + + for (i = 0; i < 4; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* square and multiply */ +static inline gf gf_sqmul(gf in, gf m) { + int i; + + uint64_t x; + uint64_t t0; + uint64_t t1; + uint64_t t; + + const uint64_t M[] = {0x0000001FF0000000, + 0x000000000FF80000, + 0x000000000007E000 + }; + + t0 = in; + t1 = m; + + x = (t1 << 6) * (t0 & (1 << 6)); + + t0 ^= (t0 << 7); + + x ^= (t1 * (t0 & (0x04001))); + x ^= (t1 * (t0 & (0x08002))) << 1; + x ^= (t1 * (t0 & (0x10004))) << 2; + x ^= (t1 * (t0 & (0x20008))) << 3; + x ^= (t1 * (t0 & (0x40010))) << 4; + x ^= (t1 * (t0 & (0x80020))) << 5; + + for (i = 0; i < 3; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* square twice and multiply */ +static inline gf gf_sq2mul(gf in, gf m) { + int i; + + uint64_t x; + uint64_t t0; + uint64_t t1; + uint64_t t; + + const uint64_t M[] = {0x1FF0000000000000, + 0x000FF80000000000, + 0x000007FC00000000, + 0x00000003FE000000, + 0x0000000001FE0000, + 0x000000000001E000 + }; + + t0 = in; + t1 = m; + + x = (t1 << 18) * (t0 & (1 << 6)); + + t0 ^= (t0 << 21); + + x ^= (t1 * (t0 & (0x010000001))); + x ^= (t1 * (t0 & (0x020000002))) << 3; + x ^= (t1 * (t0 & (0x040000004))) << 6; + x ^= (t1 * (t0 & (0x080000008))) << 9; + x ^= (t1 * (t0 & (0x100000010))) << 12; + x ^= (t1 * (t0 & (0x200000020))) << 15; + + for (i = 0; i < 6; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* return num/den */ +gf PQCLEAN_MCELIECE6688128_AVX_gf_frac(gf den, gf num) { + gf tmp_11; + gf tmp_1111; + gf out; + + tmp_11 = gf_sqmul(den, den); // 11 + tmp_1111 = gf_sq2mul(tmp_11, tmp_11); // 1111 + out = gf_sq2(tmp_1111); + out = gf_sq2mul(out, tmp_1111); // 11111111 + out = gf_sq2(out); + out = gf_sq2mul(out, tmp_1111); // 111111111111 + + return gf_sqmul(out, num); // 1111111111110 +} + +/* return 1/den */ +gf PQCLEAN_MCELIECE6688128_AVX_gf_inv(gf in) { + return PQCLEAN_MCELIECE6688128_AVX_gf_frac(in, ((gf) 1)); +} + +/* check if a == 0 */ +gf PQCLEAN_MCELIECE6688128_AVX_gf_iszero(gf a) { + uint32_t t = a; + + t -= 1; + t >>= 19; + + return (gf) t; +} + +/* multiplication in GF((2^m)^t) */ +void PQCLEAN_MCELIECE6688128_AVX_GF_mul(gf *out, const gf *in0, const gf *in1) { + int i, j; + + gf prod[255]; + + for (i = 0; i < 255; i++) { + prod[i] = 0; + } + + for (i = 0; i < 128; i++) { + for (j = 0; j < 128; j++) { + prod[i + j] ^= PQCLEAN_MCELIECE6688128_AVX_gf_mul(in0[i], in1[j]); + } + } + + // + + for (i = 254; i >= 128; i--) { + prod[i - 123] ^= PQCLEAN_MCELIECE6688128_AVX_gf_mul(prod[i], (gf) 7682); + prod[i - 125] ^= PQCLEAN_MCELIECE6688128_AVX_gf_mul(prod[i], (gf) 2159); + prod[i - 128] ^= PQCLEAN_MCELIECE6688128_AVX_gf_mul(prod[i], (gf) 3597); + } + + for (i = 0; i < 128; i++) { + out[i] = prod[i]; + } +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/gf.h b/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/gf.h new file mode 100644 index 0000000000..f38ce69032 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/gf.h @@ -0,0 +1,21 @@ +#ifndef PQCLEAN_MCELIECE6688128_AVX_GF_H +#define PQCLEAN_MCELIECE6688128_AVX_GF_H +/* + This file is for functions for field arithmetic +*/ + + +#include + +typedef uint16_t 
gf; + +gf PQCLEAN_MCELIECE6688128_AVX_gf_iszero(gf /*a*/); +gf PQCLEAN_MCELIECE6688128_AVX_gf_mul(gf /*in0*/, gf /*in1*/); +uint64_t PQCLEAN_MCELIECE6688128_AVX_gf_mul2(gf /*a*/, gf /*b0*/, gf /*b1*/); +gf PQCLEAN_MCELIECE6688128_AVX_gf_frac(gf /*den*/, gf /*num*/); +gf PQCLEAN_MCELIECE6688128_AVX_gf_inv(gf /*in*/); + +void PQCLEAN_MCELIECE6688128_AVX_GF_mul(gf * /*out*/, const gf * /*in0*/, const gf * /*in1*/); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/int32_sort.c b/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/int32_sort.c new file mode 100644 index 0000000000..d55525e73a --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/int32_sort.c @@ -0,0 +1,1211 @@ +#include "int32_sort.h" +#define int32 int32_t + +#include + +typedef __m256i int32x8; +#define int32x8_load(z) _mm256_loadu_si256((__m256i_u *) (z)) +#define int32x8_store(z,i) _mm256_storeu_si256((__m256i_u *) (z),(i)) +#define int32x8_min _mm256_min_epi32 +#define int32x8_max _mm256_max_epi32 + +#define int32x8_MINMAX(a,b) \ + do { \ + int32x8 c = int32x8_min(a, b); \ + (b) = int32x8_max(a,b); \ + (a) = c; \ + } while(0) + +static inline void int32_MINMAX(int32 *a, int32 *b) { + int32 ab = *b ^ *a; + int32 c = *b - *a; + c ^= ab & (c ^ *b); + c >>= 31; + c &= ab; + *a ^= c; + *b ^= c; +} + +static void minmax_vector(int32 *x, int32 *y, size_t n) { + if (n < 8) { + while (n > 0) { + int32_MINMAX(x, y); + ++x; + ++y; + --n; + } + return; + } + if (n & 7) { + int32x8 x0 = int32x8_load(x + n - 8); + int32x8 y0 = int32x8_load(y + n - 8); + int32x8_MINMAX(x0, y0); + int32x8_store(x + n - 8, x0); + int32x8_store(y + n - 8, y0); + n &= ~7; + } + do { + int32x8 x0 = int32x8_load(x); + int32x8 y0 = int32x8_load(y); + int32x8_MINMAX(x0, y0); + int32x8_store(x, x0); + int32x8_store(y, y0); + x += 8; + y += 8; + n -= 8; + } while (n); +} + +/* stages 8,4,2,1 of size-16 bitonic merging */ +static void merge16_finish(int32 *x, int32x8 x0, int32x8 x1, int flagdown) { + int32x8 b0, b1, c0, c1, mask; + + int32x8_MINMAX(x0, x1); + + b0 = _mm256_permute2x128_si256(x0, x1, 0x20); /* A0123B0123 */ + b1 = _mm256_permute2x128_si256(x0, x1, 0x31); /* A4567B4567 */ + + int32x8_MINMAX(b0, b1); + + c0 = _mm256_unpacklo_epi64(b0, b1); /* A0145B0145 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* A2367B2367 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A0213B0213 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A4657B4657 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* A0246B0246 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* A1357B1357 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A0123B0123 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A4567B4567 */ + + x0 = _mm256_permute2x128_si256(b0, b1, 0x20); /* A01234567 */ + x1 = _mm256_permute2x128_si256(b0, b1, 0x31); /* A01234567 */ + + if (flagdown) { + mask = _mm256_set1_epi32(-1); + x0 ^= mask; + x1 ^= mask; + } + + int32x8_store(&x[0], x0); + int32x8_store(&x[8], x1); +} + +/* stages 64,32 of bitonic merging; n is multiple of 128 */ +static void int32_twostages_32(int32 *x, size_t n) { + size_t i; + + while (n > 0) { + for (i = 0; i < 32; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + 32]); + int32x8 x2 = int32x8_load(&x[i + 64]); + int32x8 x3 = int32x8_load(&x[i + 96]); + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + 32], x1); + int32x8_store(&x[i + 64], x2); + 
int32x8_store(&x[i + 96], x3); + } + x += 128; + n -= 128; + } +} + +/* stages 4q,2q,q of bitonic merging */ +static size_t int32_threestages(int32 *x, size_t n, size_t q) { + size_t k, i; + + for (k = 0; k + 8 * q <= n; k += 8 * q) { + for (i = k; i < k + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + } + + return k; +} + +/* n is a power of 2; n >= 8; if n == 8 then flagdown */ +// NOLINTNEXTLINE(google-readability-function-size) +static void int32_sort_2power(int32 *x, size_t n, int flagdown) { + size_t p, q, i, j, k; + int32x8 mask; + + if (n == 8) { + int32 x0 = x[0]; + int32 x1 = x[1]; + int32 x2 = x[2]; + int32 x3 = x[3]; + int32 x4 = x[4]; + int32 x5 = x[5]; + int32 x6 = x[6]; + int32 x7 = x[7]; + + /* odd-even sort instead of bitonic sort */ + + int32_MINMAX(&x1, &x0); + int32_MINMAX(&x3, &x2); + int32_MINMAX(&x2, &x0); + int32_MINMAX(&x3, &x1); + int32_MINMAX(&x2, &x1); + + int32_MINMAX(&x5, &x4); + int32_MINMAX(&x7, &x6); + int32_MINMAX(&x6, &x4); + int32_MINMAX(&x7, &x5); + int32_MINMAX(&x6, &x5); + + int32_MINMAX(&x4, &x0); + int32_MINMAX(&x6, &x2); + int32_MINMAX(&x4, &x2); + + int32_MINMAX(&x5, &x1); + int32_MINMAX(&x7, &x3); + int32_MINMAX(&x5, &x3); + + int32_MINMAX(&x2, &x1); + int32_MINMAX(&x4, &x3); + int32_MINMAX(&x6, &x5); + + x[0] = x0; + x[1] = x1; + x[2] = x2; + x[3] = x3; + x[4] = x4; + x[5] = x5; + x[6] = x6; + x[7] = x7; + return; + } + + if (n == 16) { + int32x8 x0, x1, b0, b1, c0, c1; + + x0 = int32x8_load(&x[0]); + x1 = int32x8_load(&x[8]); + + mask = _mm256_set_epi32(0, 0, -1, -1, 0, 0, -1, -1); + + x0 ^= mask; /* A01234567 */ + x1 ^= mask; /* B01234567 */ + + b0 = _mm256_unpacklo_epi32(x0, x1); /* AB0AB1AB4AB5 */ + b1 = _mm256_unpackhi_epi32(x0, x1); /* AB2AB3AB6AB7 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* AB0AB2AB4AB6 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* AB1AB3AB5AB7 */ + + int32x8_MINMAX(c0, c1); + + mask = _mm256_set_epi32(0, 0, -1, -1, -1, -1, 0, 0); + c0 ^= mask; + c1 ^= mask; + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A01B01A45B45 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A23B23A67B67 */ + + int32x8_MINMAX(b0, b1); + + x0 = _mm256_unpacklo_epi64(b0, b1); /* A01234567 */ + x1 = _mm256_unpackhi_epi64(b0, b1); /* B01234567 */ + + b0 = _mm256_unpacklo_epi32(x0, x1); /* AB0AB1AB4AB5 */ + b1 = _mm256_unpackhi_epi32(x0, x1); /* AB2AB3AB6AB7 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* AB0AB2AB4AB6 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* AB1AB3AB5AB7 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A01B01A45B45 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A23B23A67B67 */ + + b0 ^= mask; + b1 ^= mask; + + c0 = 
_mm256_permute2x128_si256(b0, b1, 0x20); /* A01B01A23B23 */ + c1 = _mm256_permute2x128_si256(b0, b1, 0x31); /* A45B45A67B67 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_permute2x128_si256(c0, c1, 0x20); /* A01B01A45B45 */ + b1 = _mm256_permute2x128_si256(c0, c1, 0x31); /* A23B23A67B67 */ + + int32x8_MINMAX(b0, b1); + + x0 = _mm256_unpacklo_epi64(b0, b1); /* A01234567 */ + x1 = _mm256_unpackhi_epi64(b0, b1); /* B01234567 */ + + b0 = _mm256_unpacklo_epi32(x0, x1); /* AB0AB1AB4AB5 */ + b1 = _mm256_unpackhi_epi32(x0, x1); /* AB2AB3AB6AB7 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* AB0AB2AB4AB6 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* AB1AB3AB5AB7 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A01B01A45B45 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A23B23A67B67 */ + + x0 = _mm256_unpacklo_epi64(b0, b1); /* A01234567 */ + x1 = _mm256_unpackhi_epi64(b0, b1); /* B01234567 */ + + mask = _mm256_set1_epi32(-1); + if (flagdown) { + x1 ^= mask; + } else { + x0 ^= mask; + } + + merge16_finish(x, x0, x1, flagdown); + return; + } + + if (n == 32) { + int32x8 x0, x1, x2, x3; + + int32_sort_2power(x, 16, 1); + int32_sort_2power(x + 16, 16, 0); + + x0 = int32x8_load(&x[0]); + x1 = int32x8_load(&x[8]); + x2 = int32x8_load(&x[16]); + x3 = int32x8_load(&x[24]); + + if (flagdown) { + mask = _mm256_set1_epi32(-1); + x0 ^= mask; + x1 ^= mask; + x2 ^= mask; + x3 ^= mask; + } + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + + merge16_finish(x, x0, x1, flagdown); + merge16_finish(x + 16, x2, x3, flagdown); + return; + } + + p = n >> 3; + for (i = 0; i < p; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x2 = int32x8_load(&x[i + 2 * p]); + int32x8 x4 = int32x8_load(&x[i + 4 * p]); + int32x8 x6 = int32x8_load(&x[i + 6 * p]); + + /* odd-even stage instead of bitonic stage */ + + int32x8_MINMAX(x4, x0); + int32x8_MINMAX(x6, x2); + int32x8_MINMAX(x2, x0); + int32x8_MINMAX(x6, x4); + int32x8_MINMAX(x2, x4); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + 2 * p], x2); + int32x8_store(&x[i + 4 * p], x4); + int32x8_store(&x[i + 6 * p], x6); + + int32x8 x1 = int32x8_load(&x[i + p]); + int32x8 x3 = int32x8_load(&x[i + 3 * p]); + int32x8 x5 = int32x8_load(&x[i + 5 * p]); + int32x8 x7 = int32x8_load(&x[i + 7 * p]); + + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x5, x3); + + int32x8_store(&x[i + p], x1); + int32x8_store(&x[i + 3 * p], x3); + int32x8_store(&x[i + 5 * p], x5); + int32x8_store(&x[i + 7 * p], x7); + } + + if (n >= 128) { + int flip, flipflip; + + mask = _mm256_set1_epi32(-1); + + for (j = 0; j < n; j += 32) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 16]); + x0 ^= mask; + x1 ^= mask; + int32x8_store(&x[j], x0); + int32x8_store(&x[j + 16], x1); + } + + p = 8; + for (;;) { /* for p in [8, 16, ..., n/16] */ + q = p >> 1; + while (q >= 128) { + int32_threestages(x, n, q >> 2); + q >>= 3; + } + if (q == 64) { + int32_twostages_32(x, n); + q = 16; + } + if (q == 32) { + q = 8; + for (k = 0; k < n; k += 8 * q) { + for (i = k; i < k + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + 
int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + } + q = 4; + } + if (q == 16) { + q = 8; + for (k = 0; k < n; k += 4 * q) { + for (i = k; i < k + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + } + } + q = 4; + } + if (q == 8) { + for (k = 0; k < n; k += q + q) { + int32x8 x0 = int32x8_load(&x[k]); + int32x8 x1 = int32x8_load(&x[k + q]); + + int32x8_MINMAX(x0, x1); + + int32x8_store(&x[k], x0); + int32x8_store(&x[k + q], x1); + } + } + + q = n >> 3; + flip = 0; + if (p << 1 == q) { + flip = 1; + } + flipflip = 1 - flip; + for (j = 0; j < q; j += p + p) { + for (k = j; k < j + p + p; k += p) { + for (i = k; i < k + p; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + + if (flip) { + x0 ^= mask; + x1 ^= mask; + x2 ^= mask; + x3 ^= mask; + x4 ^= mask; + x5 ^= mask; + x6 ^= mask; + x7 ^= mask; + } + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + flip ^= 1; + } + flip ^= flipflip; + } + + if (p << 4 == n) { + break; + } + p <<= 1; + } + } + + for (p = 4; p >= 1; p >>= 1) { + int32 *z = x; + int32 *target = x + n; + if (p == 4) { + mask = _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1); + while (z != target) { + int32x8 x0 = int32x8_load(&z[0]); + int32x8 x1 = int32x8_load(&z[8]); + x0 ^= mask; + x1 ^= mask; + int32x8_store(&z[0], x0); + int32x8_store(&z[8], x1); + z += 16; + } + } else if (p == 2) { + mask = _mm256_set_epi32(0, 0, -1, -1, -1, -1, 0, 0); + while (z != target) { + int32x8 x0 = int32x8_load(&z[0]); + int32x8 x1 = int32x8_load(&z[8]); + x0 ^= mask; + x1 ^= mask; + int32x8 b0 = _mm256_permute2x128_si256(x0, x1, 0x20); + int32x8 b1 = _mm256_permute2x128_si256(x0, x1, 0x31); + int32x8_MINMAX(b0, b1); + int32x8 c0 = _mm256_permute2x128_si256(b0, b1, 0x20); + int32x8 c1 = _mm256_permute2x128_si256(b0, b1, 0x31); + int32x8_store(&z[0], c0); + int32x8_store(&z[8], c1); + z += 16; + } + } else { /* p == 1 */ + mask = 
_mm256_set_epi32(0, -1, -1, 0, 0, -1, -1, 0); + while (z != target) { + int32x8 x0 = int32x8_load(&z[0]); + int32x8 x1 = int32x8_load(&z[8]); + x0 ^= mask; + x1 ^= mask; + int32x8 b0 = _mm256_permute2x128_si256(x0, x1, 0x20); /* A0123B0123 */ + int32x8 b1 = _mm256_permute2x128_si256(x0, x1, 0x31); /* A4567B4567 */ + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); /* A0145B0145 */ + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); /* A2367B2367 */ + int32x8_MINMAX(c0, c1); + int32x8 d0 = _mm256_unpacklo_epi64(c0, c1); /* A0123B0123 */ + int32x8 d1 = _mm256_unpackhi_epi64(c0, c1); /* A4567B4567 */ + int32x8_MINMAX(d0, d1); + int32x8 e0 = _mm256_permute2x128_si256(d0, d1, 0x20); + int32x8 e1 = _mm256_permute2x128_si256(d0, d1, 0x31); + int32x8_store(&z[0], e0); + int32x8_store(&z[8], e1); + z += 16; + } + } + + q = n >> 4; + while (q >= 128 || q == 32) { + int32_threestages(x, n, q >> 2); + q >>= 3; + } + while (q >= 16) { + q >>= 1; + for (j = 0; j < n; j += 4 * q) { + for (k = j; k < j + q; k += 8) { + int32x8 x0 = int32x8_load(&x[k]); + int32x8 x1 = int32x8_load(&x[k + q]); + int32x8 x2 = int32x8_load(&x[k + 2 * q]); + int32x8 x3 = int32x8_load(&x[k + 3 * q]); + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + + int32x8_store(&x[k], x0); + int32x8_store(&x[k + q], x1); + int32x8_store(&x[k + 2 * q], x2); + int32x8_store(&x[k + 3 * q], x3); + } + } + q >>= 1; + } + if (q == 8) { + for (j = 0; j < n; j += 2 * q) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + q]); + + int32x8_MINMAX(x0, x1); + + int32x8_store(&x[j], x0); + int32x8_store(&x[j + q], x1); + } + } + + q = n >> 3; + for (k = 0; k < q; k += 8) { + int32x8 x0 = int32x8_load(&x[k]); + int32x8 x1 = int32x8_load(&x[k + q]); + int32x8 x2 = int32x8_load(&x[k + 2 * q]); + int32x8 x3 = int32x8_load(&x[k + 3 * q]); + int32x8 x4 = int32x8_load(&x[k + 4 * q]); + int32x8 x5 = int32x8_load(&x[k + 5 * q]); + int32x8 x6 = int32x8_load(&x[k + 6 * q]); + int32x8 x7 = int32x8_load(&x[k + 7 * q]); + + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + + int32x8_store(&x[k], x0); + int32x8_store(&x[k + q], x1); + int32x8_store(&x[k + 2 * q], x2); + int32x8_store(&x[k + 3 * q], x3); + int32x8_store(&x[k + 4 * q], x4); + int32x8_store(&x[k + 5 * q], x5); + int32x8_store(&x[k + 6 * q], x6); + int32x8_store(&x[k + 7 * q], x7); + } + } + + /* everything is still masked with _mm256_set_epi32(0,-1,0,-1,0,-1,0,-1); */ + mask = _mm256_set1_epi32(-1); + + for (i = 0; i < n; i += 64) { + int32x8 a0 = int32x8_load(&x[i]); + int32x8 a1 = int32x8_load(&x[i + 8]); + int32x8 a2 = int32x8_load(&x[i + 16]); + int32x8 a3 = int32x8_load(&x[i + 24]); + int32x8 a4 = int32x8_load(&x[i + 32]); + int32x8 a5 = int32x8_load(&x[i + 40]); + int32x8 a6 = int32x8_load(&x[i + 48]); + int32x8 a7 = int32x8_load(&x[i + 56]); + + int32x8 b0 = _mm256_unpacklo_epi32(a0, a1); /* AB0AB1AB4AB5 */ + int32x8 b1 = _mm256_unpackhi_epi32(a0, a1); /* AB2AB3AB6AB7 */ + int32x8 b2 = _mm256_unpacklo_epi32(a2, a3); /* CD0CD1CD4CD5 */ + int32x8 b3 = _mm256_unpackhi_epi32(a2, a3); /* CD2CD3CD6CD7 */ + int32x8 b4 = _mm256_unpacklo_epi32(a4, a5); /* EF0EF1EF4EF5 */ + int32x8 b5 = _mm256_unpackhi_epi32(a4, a5); /* EF2EF3EF6EF7 */ + int32x8 b6 = _mm256_unpacklo_epi32(a6, a7); /* 
GH0GH1GH4GH5 */ + int32x8 b7 = _mm256_unpackhi_epi32(a6, a7); /* GH2GH3GH6GH7 */ + + int32x8 c0 = _mm256_unpacklo_epi64(b0, b2); /* ABCD0ABCD4 */ + int32x8 c1 = _mm256_unpacklo_epi64(b1, b3); /* ABCD2ABCD6 */ + int32x8 c2 = _mm256_unpackhi_epi64(b0, b2); /* ABCD1ABCD5 */ + int32x8 c3 = _mm256_unpackhi_epi64(b1, b3); /* ABCD3ABCD7 */ + int32x8 c4 = _mm256_unpacklo_epi64(b4, b6); /* EFGH0EFGH4 */ + int32x8 c5 = _mm256_unpacklo_epi64(b5, b7); /* EFGH2EFGH6 */ + int32x8 c6 = _mm256_unpackhi_epi64(b4, b6); /* EFGH1EFGH5 */ + int32x8 c7 = _mm256_unpackhi_epi64(b5, b7); /* EFGH3EFGH7 */ + + if (flagdown) { + c2 ^= mask; + c3 ^= mask; + c6 ^= mask; + c7 ^= mask; + } else { + c0 ^= mask; + c1 ^= mask; + c4 ^= mask; + c5 ^= mask; + } + + int32x8 d0 = _mm256_permute2x128_si256(c0, c4, 0x20); /* ABCDEFGH0 */ + int32x8 d1 = _mm256_permute2x128_si256(c2, c6, 0x20); /* ABCDEFGH1 */ + int32x8 d2 = _mm256_permute2x128_si256(c1, c5, 0x20); /* ABCDEFGH2 */ + int32x8 d3 = _mm256_permute2x128_si256(c3, c7, 0x20); /* ABCDEFGH5 */ + int32x8 d4 = _mm256_permute2x128_si256(c0, c4, 0x31); /* ABCDEFGH4 */ + int32x8 d5 = _mm256_permute2x128_si256(c2, c6, 0x31); /* ABCDEFGH3 */ + int32x8 d6 = _mm256_permute2x128_si256(c1, c5, 0x31); /* ABCDEFGH6 */ + int32x8 d7 = _mm256_permute2x128_si256(c3, c7, 0x31); /* ABCDEFGH7 */ + + int32x8_MINMAX(d0, d1); + int32x8_MINMAX(d2, d3); + int32x8_MINMAX(d4, d5); + int32x8_MINMAX(d6, d7); + int32x8_MINMAX(d0, d2); + int32x8_MINMAX(d1, d3); + int32x8_MINMAX(d4, d6); + int32x8_MINMAX(d5, d7); + int32x8_MINMAX(d0, d4); + int32x8_MINMAX(d1, d5); + int32x8_MINMAX(d2, d6); + int32x8_MINMAX(d3, d7); + + int32x8 e0 = _mm256_unpacklo_epi32(d0, d1); + int32x8 e1 = _mm256_unpackhi_epi32(d0, d1); + int32x8 e2 = _mm256_unpacklo_epi32(d2, d3); + int32x8 e3 = _mm256_unpackhi_epi32(d2, d3); + int32x8 e4 = _mm256_unpacklo_epi32(d4, d5); + int32x8 e5 = _mm256_unpackhi_epi32(d4, d5); + int32x8 e6 = _mm256_unpacklo_epi32(d6, d7); + int32x8 e7 = _mm256_unpackhi_epi32(d6, d7); + + int32x8 f0 = _mm256_unpacklo_epi64(e0, e2); + int32x8 f1 = _mm256_unpacklo_epi64(e1, e3); + int32x8 f2 = _mm256_unpackhi_epi64(e0, e2); + int32x8 f3 = _mm256_unpackhi_epi64(e1, e3); + int32x8 f4 = _mm256_unpacklo_epi64(e4, e6); + int32x8 f5 = _mm256_unpacklo_epi64(e5, e7); + int32x8 f6 = _mm256_unpackhi_epi64(e4, e6); + int32x8 f7 = _mm256_unpackhi_epi64(e5, e7); + + int32x8 g0 = _mm256_permute2x128_si256(f0, f4, 0x20); + int32x8 g1 = _mm256_permute2x128_si256(f2, f6, 0x20); + int32x8 g2 = _mm256_permute2x128_si256(f1, f5, 0x20); + int32x8 g3 = _mm256_permute2x128_si256(f3, f7, 0x20); + int32x8 g4 = _mm256_permute2x128_si256(f0, f4, 0x31); + int32x8 g5 = _mm256_permute2x128_si256(f2, f6, 0x31); + int32x8 g6 = _mm256_permute2x128_si256(f1, f5, 0x31); + int32x8 g7 = _mm256_permute2x128_si256(f3, f7, 0x31); + + int32x8_store(&x[i], g0); + int32x8_store(&x[i + 8], g1); + int32x8_store(&x[i + 16], g2); + int32x8_store(&x[i + 24], g3); + int32x8_store(&x[i + 32], g4); + int32x8_store(&x[i + 40], g5); + int32x8_store(&x[i + 48], g6); + int32x8_store(&x[i + 56], g7); + } + + q = n >> 4; + while (q >= 128 || q == 32) { + q >>= 2; + for (j = 0; j < n; j += 8 * q) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + 
int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + } + q >>= 1; + } + while (q >= 16) { + q >>= 1; + for (j = 0; j < n; j += 4 * q) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + } + } + q >>= 1; + } + if (q == 8) { + for (j = 0; j < n; j += q + q) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + q]); + int32x8_MINMAX(x0, x1); + int32x8_store(&x[j], x0); + int32x8_store(&x[j + q], x1); + } + } + + q = n >> 3; + for (i = 0; i < q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + + int32x8 b0 = _mm256_unpacklo_epi32(x0, x4); /* AE0AE1AE4AE5 */ + int32x8 b1 = _mm256_unpackhi_epi32(x0, x4); /* AE2AE3AE6AE7 */ + int32x8 b2 = _mm256_unpacklo_epi32(x1, x5); /* BF0BF1BF4BF5 */ + int32x8 b3 = _mm256_unpackhi_epi32(x1, x5); /* BF2BF3BF6BF7 */ + int32x8 b4 = _mm256_unpacklo_epi32(x2, x6); /* CG0CG1CG4CG5 */ + int32x8 b5 = _mm256_unpackhi_epi32(x2, x6); /* CG2CG3CG6CG7 */ + int32x8 b6 = _mm256_unpacklo_epi32(x3, x7); /* DH0DH1DH4DH5 */ + int32x8 b7 = _mm256_unpackhi_epi32(x3, x7); /* DH2DH3DH6DH7 */ + + int32x8 c0 = _mm256_unpacklo_epi64(b0, b4); /* AECG0AECG4 */ + int32x8 c1 = _mm256_unpacklo_epi64(b1, b5); /* AECG2AECG6 */ + int32x8 c2 = _mm256_unpackhi_epi64(b0, b4); /* AECG1AECG5 */ + int32x8 c3 = _mm256_unpackhi_epi64(b1, b5); /* AECG3AECG7 */ + int32x8 c4 = _mm256_unpacklo_epi64(b2, b6); /* BFDH0BFDH4 */ + int32x8 c5 = _mm256_unpacklo_epi64(b3, b7); /* BFDH2BFDH6 */ + int32x8 c6 = _mm256_unpackhi_epi64(b2, b6); /* BFDH1BFDH5 */ + int32x8 c7 = _mm256_unpackhi_epi64(b3, b7); /* BFDH3BFDH7 */ + + int32x8 d0 = _mm256_permute2x128_si256(c0, c4, 0x20); /* AECGBFDH0 */ + int32x8 d1 = _mm256_permute2x128_si256(c1, c5, 0x20); /* AECGBFDH2 */ + int32x8 d2 = _mm256_permute2x128_si256(c2, c6, 0x20); /* AECGBFDH1 */ + int32x8 d3 = _mm256_permute2x128_si256(c3, c7, 0x20); /* AECGBFDH3 */ + int32x8 d4 = _mm256_permute2x128_si256(c0, c4, 0x31); /* AECGBFDH4 */ + int32x8 d5 = _mm256_permute2x128_si256(c1, c5, 0x31); /* AECGBFDH6 */ + int32x8 d6 = 
_mm256_permute2x128_si256(c2, c6, 0x31); /* AECGBFDH5 */ + int32x8 d7 = _mm256_permute2x128_si256(c3, c7, 0x31); /* AECGBFDH7 */ + + if (flagdown) { + d0 ^= mask; + d1 ^= mask; + d2 ^= mask; + d3 ^= mask; + d4 ^= mask; + d5 ^= mask; + d6 ^= mask; + d7 ^= mask; + } + + int32x8_store(&x[i], d0); + int32x8_store(&x[i + q], d4); + int32x8_store(&x[i + 2 * q], d1); + int32x8_store(&x[i + 3 * q], d5); + int32x8_store(&x[i + 4 * q], d2); + int32x8_store(&x[i + 5 * q], d6); + int32x8_store(&x[i + 6 * q], d3); + int32x8_store(&x[i + 7 * q], d7); + } +} + +void PQCLEAN_MCELIECE6688128_AVX_int32_sort(int32 *x, size_t n) { + size_t q, i, j; + + if (n <= 8) { + if (n == 8) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + int32_MINMAX(&x[4], &x[5]); + int32_MINMAX(&x[5], &x[6]); + int32_MINMAX(&x[6], &x[7]); + } + if (n >= 7) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + int32_MINMAX(&x[4], &x[5]); + int32_MINMAX(&x[5], &x[6]); + } + if (n >= 6) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + int32_MINMAX(&x[4], &x[5]); + } + if (n >= 5) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + } + if (n >= 4) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + } + if (n >= 3) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + } + if (n >= 2) { + int32_MINMAX(&x[0], &x[1]); + } + return; + } + + if (!(n & (n - 1))) { + int32_sort_2power(x, n, 0); + return; + } + + q = 8; + while (q < n - q) { + q += q; + } + /* n > q >= 8 */ + + if (q <= 128) { /* n <= 256 */ + int32x8 y[32]; + for (i = q >> 3; i < q >> 2; ++i) { + y[i] = _mm256_set1_epi32(0x7fffffff); + } + for (i = 0; i < n; ++i) { + ((int32 *)y)[i] = x[i]; + } + int32_sort_2power((int32 *) y, 2 * q, 0); + for (i = 0; i < n; ++i) { + x[i] = ((int32 *) y)[i]; + } + return; + } + + int32_sort_2power(x, q, 1); + PQCLEAN_MCELIECE6688128_AVX_int32_sort(x + q, n - q); + + while (q >= 64) { + q >>= 2; + j = int32_threestages(x, n, q); + minmax_vector(x + j, x + j + 4 * q, n - 4 * q - j); + if (j + 4 * q <= n) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + } + j += 4 * q; + } + minmax_vector(x + j, x + j + 2 * q, n - 2 * q - j); + if (j + 2 * q <= n) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8_MINMAX(x0, x1); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + } + j += 2 * q; + } + minmax_vector(x + j, x + j + q, n - q - j); + q >>= 1; + } + if (q == 32) { + j = 0; + for (; j + 64 <= n; j += 64) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 8]); + int32x8 x2 = int32x8_load(&x[j + 16]); + int32x8 x3 = int32x8_load(&x[j + 24]); + int32x8 x4 = int32x8_load(&x[j + 32]); + int32x8 x5 = int32x8_load(&x[j + 40]); + int32x8 x6 = int32x8_load(&x[j + 48]); + int32x8 x7 = int32x8_load(&x[j + 56]); + int32x8_MINMAX(x0, x4); + 
int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8 a0 = _mm256_permute2x128_si256(x0, x1, 0x20); + int32x8 a1 = _mm256_permute2x128_si256(x0, x1, 0x31); + int32x8 a2 = _mm256_permute2x128_si256(x2, x3, 0x20); + int32x8 a3 = _mm256_permute2x128_si256(x2, x3, 0x31); + int32x8 a4 = _mm256_permute2x128_si256(x4, x5, 0x20); + int32x8 a5 = _mm256_permute2x128_si256(x4, x5, 0x31); + int32x8 a6 = _mm256_permute2x128_si256(x6, x7, 0x20); + int32x8 a7 = _mm256_permute2x128_si256(x6, x7, 0x31); + int32x8_MINMAX(a0, a1); + int32x8_MINMAX(a2, a3); + int32x8_MINMAX(a4, a5); + int32x8_MINMAX(a6, a7); + int32x8 b0 = _mm256_permute2x128_si256(a0, a1, 0x20); + int32x8 b1 = _mm256_permute2x128_si256(a0, a1, 0x31); + int32x8 b2 = _mm256_permute2x128_si256(a2, a3, 0x20); + int32x8 b3 = _mm256_permute2x128_si256(a2, a3, 0x31); + int32x8 b4 = _mm256_permute2x128_si256(a4, a5, 0x20); + int32x8 b5 = _mm256_permute2x128_si256(a4, a5, 0x31); + int32x8 b6 = _mm256_permute2x128_si256(a6, a7, 0x20); + int32x8 b7 = _mm256_permute2x128_si256(a6, a7, 0x31); + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); + int32x8 c2 = _mm256_unpacklo_epi64(b2, b3); + int32x8 c3 = _mm256_unpackhi_epi64(b2, b3); + int32x8 c4 = _mm256_unpacklo_epi64(b4, b5); + int32x8 c5 = _mm256_unpackhi_epi64(b4, b5); + int32x8 c6 = _mm256_unpacklo_epi64(b6, b7); + int32x8 c7 = _mm256_unpackhi_epi64(b6, b7); + int32x8_MINMAX(c0, c1); + int32x8_MINMAX(c2, c3); + int32x8_MINMAX(c4, c5); + int32x8_MINMAX(c6, c7); + int32x8 d0 = _mm256_unpacklo_epi32(c0, c1); + int32x8 d1 = _mm256_unpackhi_epi32(c0, c1); + int32x8 d2 = _mm256_unpacklo_epi32(c2, c3); + int32x8 d3 = _mm256_unpackhi_epi32(c2, c3); + int32x8 d4 = _mm256_unpacklo_epi32(c4, c5); + int32x8 d5 = _mm256_unpackhi_epi32(c4, c5); + int32x8 d6 = _mm256_unpacklo_epi32(c6, c7); + int32x8 d7 = _mm256_unpackhi_epi32(c6, c7); + int32x8 e0 = _mm256_unpacklo_epi64(d0, d1); + int32x8 e1 = _mm256_unpackhi_epi64(d0, d1); + int32x8 e2 = _mm256_unpacklo_epi64(d2, d3); + int32x8 e3 = _mm256_unpackhi_epi64(d2, d3); + int32x8 e4 = _mm256_unpacklo_epi64(d4, d5); + int32x8 e5 = _mm256_unpackhi_epi64(d4, d5); + int32x8 e6 = _mm256_unpacklo_epi64(d6, d7); + int32x8 e7 = _mm256_unpackhi_epi64(d6, d7); + int32x8_MINMAX(e0, e1); + int32x8_MINMAX(e2, e3); + int32x8_MINMAX(e4, e5); + int32x8_MINMAX(e6, e7); + int32x8 f0 = _mm256_unpacklo_epi32(e0, e1); + int32x8 f1 = _mm256_unpackhi_epi32(e0, e1); + int32x8 f2 = _mm256_unpacklo_epi32(e2, e3); + int32x8 f3 = _mm256_unpackhi_epi32(e2, e3); + int32x8 f4 = _mm256_unpacklo_epi32(e4, e5); + int32x8 f5 = _mm256_unpackhi_epi32(e4, e5); + int32x8 f6 = _mm256_unpacklo_epi32(e6, e7); + int32x8 f7 = _mm256_unpackhi_epi32(e6, e7); + int32x8_store(&x[j], f0); + int32x8_store(&x[j + 8], f1); + int32x8_store(&x[j + 16], f2); + int32x8_store(&x[j + 24], f3); + int32x8_store(&x[j + 32], f4); + int32x8_store(&x[j + 40], f5); + int32x8_store(&x[j + 48], f6); + int32x8_store(&x[j + 56], f7); + } + minmax_vector(x + j, x + j + 32, n - 32 - j); + goto continue16; + } + if (q == 16) { + j = 0; +continue16: + for (; j + 32 <= n; j += 32) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 8]); + int32x8 x2 = int32x8_load(&x[j + 16]); + int32x8 x3 = int32x8_load(&x[j + 24]); + int32x8_MINMAX(x0, x2); + 
int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8 a0 = _mm256_permute2x128_si256(x0, x1, 0x20); + int32x8 a1 = _mm256_permute2x128_si256(x0, x1, 0x31); + int32x8 a2 = _mm256_permute2x128_si256(x2, x3, 0x20); + int32x8 a3 = _mm256_permute2x128_si256(x2, x3, 0x31); + int32x8_MINMAX(a0, a1); + int32x8_MINMAX(a2, a3); + int32x8 b0 = _mm256_permute2x128_si256(a0, a1, 0x20); + int32x8 b1 = _mm256_permute2x128_si256(a0, a1, 0x31); + int32x8 b2 = _mm256_permute2x128_si256(a2, a3, 0x20); + int32x8 b3 = _mm256_permute2x128_si256(a2, a3, 0x31); + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); + int32x8 c2 = _mm256_unpacklo_epi64(b2, b3); + int32x8 c3 = _mm256_unpackhi_epi64(b2, b3); + int32x8_MINMAX(c0, c1); + int32x8_MINMAX(c2, c3); + int32x8 d0 = _mm256_unpacklo_epi32(c0, c1); + int32x8 d1 = _mm256_unpackhi_epi32(c0, c1); + int32x8 d2 = _mm256_unpacklo_epi32(c2, c3); + int32x8 d3 = _mm256_unpackhi_epi32(c2, c3); + int32x8 e0 = _mm256_unpacklo_epi64(d0, d1); + int32x8 e1 = _mm256_unpackhi_epi64(d0, d1); + int32x8 e2 = _mm256_unpacklo_epi64(d2, d3); + int32x8 e3 = _mm256_unpackhi_epi64(d2, d3); + int32x8_MINMAX(e0, e1); + int32x8_MINMAX(e2, e3); + int32x8 f0 = _mm256_unpacklo_epi32(e0, e1); + int32x8 f1 = _mm256_unpackhi_epi32(e0, e1); + int32x8 f2 = _mm256_unpacklo_epi32(e2, e3); + int32x8 f3 = _mm256_unpackhi_epi32(e2, e3); + int32x8_store(&x[j], f0); + int32x8_store(&x[j + 8], f1); + int32x8_store(&x[j + 16], f2); + int32x8_store(&x[j + 24], f3); + } + minmax_vector(x + j, x + j + 16, n - 16 - j); + goto continue8; + } + /* q == 8 */ + j = 0; +continue8: + for (; j + 16 <= n; j += 16) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 8]); + int32x8_MINMAX(x0, x1); + int32x8_store(&x[j], x0); + int32x8_store(&x[j + 8], x1); + int32x8 a0 = _mm256_permute2x128_si256(x0, x1, 0x20); /* x0123y0123 */ + int32x8 a1 = _mm256_permute2x128_si256(x0, x1, 0x31); /* x4567y4567 */ + int32x8_MINMAX(a0, a1); + int32x8 b0 = _mm256_permute2x128_si256(a0, a1, 0x20); /* x01234567 */ + int32x8 b1 = _mm256_permute2x128_si256(a0, a1, 0x31); /* y01234567 */ + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); /* x01y01x45y45 */ + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); /* x23y23x67y67 */ + int32x8_MINMAX(c0, c1); + int32x8 d0 = _mm256_unpacklo_epi32(c0, c1); /* x02x13x46x57 */ + int32x8 d1 = _mm256_unpackhi_epi32(c0, c1); /* y02y13y46y57 */ + int32x8 e0 = _mm256_unpacklo_epi64(d0, d1); /* x02y02x46y46 */ + int32x8 e1 = _mm256_unpackhi_epi64(d0, d1); /* x13y13x57y57 */ + int32x8_MINMAX(e0, e1); + int32x8 f0 = _mm256_unpacklo_epi32(e0, e1); /* x01234567 */ + int32x8 f1 = _mm256_unpackhi_epi32(e0, e1); /* y01234567 */ + int32x8_store(&x[j], f0); + int32x8_store(&x[j + 8], f1); + } + minmax_vector(x + j, x + j + 8, n - 8 - j); + if (j + 8 <= n) { + int32_MINMAX(&x[j], &x[j + 4]); + int32_MINMAX(&x[j + 1], &x[j + 5]); + int32_MINMAX(&x[j + 2], &x[j + 6]); + int32_MINMAX(&x[j + 3], &x[j + 7]); + int32_MINMAX(&x[j], &x[j + 2]); + int32_MINMAX(&x[j + 1], &x[j + 3]); + int32_MINMAX(&x[j], &x[j + 1]); + int32_MINMAX(&x[j + 2], &x[j + 3]); + int32_MINMAX(&x[j + 4], &x[j + 6]); + int32_MINMAX(&x[j + 5], &x[j + 7]); + int32_MINMAX(&x[j + 4], &x[j + 5]); + int32_MINMAX(&x[j + 6], &x[j + 7]); + j += 8; + } + minmax_vector(x + j, x + j + 4, n - 4 - j); + if (j + 4 <= n) { + int32_MINMAX(&x[j], &x[j + 2]); + int32_MINMAX(&x[j + 1], &x[j + 3]); + int32_MINMAX(&x[j], &x[j + 1]); + int32_MINMAX(&x[j + 2], &x[j + 3]); + j += 4; + } + if (j + 3 
<= n) { + int32_MINMAX(&x[j], &x[j + 2]); + } + if (j + 2 <= n) { + int32_MINMAX(&x[j], &x[j + 1]); + } +} diff --git a/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/int32_sort.h b/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/int32_sort.h new file mode 100644 index 0000000000..049dae1629 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/int32_sort.h @@ -0,0 +1,9 @@ +#ifndef PQCLEAN_MCELIECE6688128_AVX_INT32_SORT_H +#define PQCLEAN_MCELIECE6688128_AVX_INT32_SORT_H + +#include <stddef.h> +#include <stdint.h> + +void PQCLEAN_MCELIECE6688128_AVX_int32_sort(int32_t *x, size_t n); + +#endif diff --git a/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/operations.c b/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/operations.c new file mode 100644 index 0000000000..c9b9587f5f --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/operations.c @@ -0,0 +1,136 @@ +#include "api.h" + +#include "aes256ctr.h" +#include "controlbits.h" +#include "crypto_hash.h" +#include "decrypt.h" +#include "encrypt.h" +#include "params.h" +#include "pk_gen.h" +#include "randombytes.h" +#include "sk_gen.h" +#include "util.h" + +#include <stdint.h> +#include <string.h> + +int PQCLEAN_MCELIECE6688128_AVX_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const uint8_t *pk +) { + uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; + uint8_t *e = two_e + 1; + uint8_t one_ec[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ] = {1}; + + PQCLEAN_MCELIECE6688128_AVX_encrypt(c, e, pk); + + crypto_hash_32b(c + SYND_BYTES, two_e, sizeof(two_e)); + + memcpy(one_ec + 1, e, SYS_N / 8); + memcpy(one_ec + 1 + SYS_N / 8, c, SYND_BYTES + 32); + + crypto_hash_32b(key, one_ec, sizeof(one_ec)); + + return 0; +} + +int PQCLEAN_MCELIECE6688128_AVX_crypto_kem_dec( + uint8_t *key, + const uint8_t *c, + const uint8_t *sk +) { + int i; + + uint8_t ret_confirm = 0; + uint8_t ret_decrypt = 0; + + uint16_t m; + + uint8_t conf[32]; + uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; + uint8_t *e = two_e + 1; + uint8_t preimage[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ]; + uint8_t *x = preimage; + + // + + ret_decrypt = (uint8_t)PQCLEAN_MCELIECE6688128_AVX_decrypt(e, sk + SYS_N / 8, c); + + crypto_hash_32b(conf, two_e, sizeof(two_e)); + + for (i = 0; i < 32; i++) { + ret_confirm |= conf[i] ^ c[SYND_BYTES + i]; + } + + m = ret_decrypt | ret_confirm; + m -= 1; + m >>= 8; + + *x++ = (~m & 0) | (m & 1); + for (i = 0; i < SYS_N / 8; i++) { + *x++ = (~m & sk[i]) | (m & e[i]); + } + for (i = 0; i < SYND_BYTES + 32; i++) { + *x++ = c[i]; + } + + crypto_hash_32b(key, preimage, sizeof(preimage)); + + return 0; +} + +int PQCLEAN_MCELIECE6688128_AVX_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +) { + int i; + uint8_t seed[ 32 ]; + uint8_t r[ SYS_T * 2 + (1 << GFBITS)*sizeof(uint32_t) + SYS_N / 8 + 32 ]; + uint8_t nonce[ 16 ] = {0}; + uint8_t *rp; + + gf f[ SYS_T ]; // element in GF(2^mt) + gf irr[ SYS_T ]; // Goppa polynomial + uint32_t perm[ 1 << GFBITS ]; // random permutation + + randombytes(seed, sizeof(seed)); + + while (1) { + rp = r; + PQCLEAN_MCELIECE6688128_AVX_aes256ctr(r, sizeof(r), nonce, seed); + memcpy(seed, &r[ sizeof(r) - 32 ], 32); + + for (i = 0; i < SYS_T; i++) { + f[i] = PQCLEAN_MCELIECE6688128_AVX_load2(rp + i * 2); + } + rp += sizeof(f); + if (PQCLEAN_MCELIECE6688128_AVX_genpoly_gen(irr, f)) { + continue; + } + + for (i = 0; i < (1 << GFBITS); i++) { + perm[i] = PQCLEAN_MCELIECE6688128_AVX_load4(rp + i * 4); + } + rp += sizeof(perm); + if (PQCLEAN_MCELIECE6688128_AVX_perm_check(perm)) { + continue; + } + + for (i = 0; i < SYS_T; i++) {
PQCLEAN_MCELIECE6688128_AVX_store2(sk + SYS_N / 8 + i * 2, irr[i]); + } + if (PQCLEAN_MCELIECE6688128_AVX_pk_gen(pk, perm, sk + SYS_N / 8)) { + continue; + } + + memcpy(sk, rp, SYS_N / 8); + PQCLEAN_MCELIECE6688128_AVX_controlbits(sk + SYS_N / 8 + IRR_BYTES, perm); + + break; + } + + return 0; +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/params.h b/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/params.h new file mode 100644 index 0000000000..cc604978d8 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/params.h @@ -0,0 +1,21 @@ +#ifndef PQCLEAN_MCELIECE6688128_AVX_PARAMS_H +#define PQCLEAN_MCELIECE6688128_AVX_PARAMS_H + +#define GFBITS 13 +#define SYS_N 6688 +#define SYS_T 128 + +#define COND_BYTES ((1 << (GFBITS-4))*(2*GFBITS - 1)) +#define IRR_BYTES (SYS_T * 2) + +#define PK_NROWS (SYS_T*GFBITS) +#define PK_NCOLS (SYS_N - PK_NROWS) +#define PK_ROW_BYTES ((PK_NCOLS + 7)/8) + +#define SK_BYTES (SYS_N/8 + IRR_BYTES + COND_BYTES) +#define SYND_BYTES ((PK_NROWS + 7)/8) + +#define GFMASK ((1 << GFBITS) - 1) + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/pk_gen.c b/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/pk_gen.c new file mode 100644 index 0000000000..d4eff2571f --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/pk_gen.c @@ -0,0 +1,286 @@ +/* + This file is for public-key generation +*/ + +#include "pk_gen.h" + +#include "benes.h" +#include "controlbits.h" +#include "fft.h" +#include "params.h" +#include "util.h" + +#include + +static void de_bitslicing(uint64_t *out, vec256 in[][GFBITS]) { + int i, j, r; + uint64_t u = 0; + + for (i = 0; i < (1 << GFBITS); i++) { + out[i] = 0 ; + } + + for (i = 0; i < 32; i++) { + for (j = GFBITS - 1; j >= 0; j--) { + u = PQCLEAN_MCELIECE6688128_AVX_vec256_extract(in[i][j], 0); + for (r = 0; r < 64; r++) { + out[i * 256 + 0 * 64 + r] <<= 1; + out[i * 256 + 0 * 64 + r] |= (u >> r) & 1; + } + u = PQCLEAN_MCELIECE6688128_AVX_vec256_extract(in[i][j], 1); + for (r = 0; r < 64; r++) { + out[i * 256 + 1 * 64 + r] <<= 1; + out[i * 256 + 1 * 64 + r] |= (u >> r) & 1; + } + u = PQCLEAN_MCELIECE6688128_AVX_vec256_extract(in[i][j], 2); + for (r = 0; r < 64; r++) { + out[i * 256 + 2 * 64 + r] <<= 1; + out[i * 256 + 2 * 64 + r] |= (u >> r) & 1; + } + u = PQCLEAN_MCELIECE6688128_AVX_vec256_extract(in[i][j], 3); + for (r = 0; r < 64; r++) { + out[i * 256 + 3 * 64 + r] <<= 1; + out[i * 256 + 3 * 64 + r] |= (u >> r) & 1; + } + } + } +} + +static void to_bitslicing_2x(vec256 out0[][GFBITS], vec256 out1[][GFBITS], const uint64_t *in) { + int i, j, k, r; + uint64_t u[4] = {0}; + + for (i = 0; i < 32; i++) { + for (j = GFBITS - 1; j >= 0; j--) { + for (k = 0; k < 4; k++) { + for (r = 63; r >= 0; r--) { + u[k] <<= 1; + u[k] |= (in[i * 256 + k * 64 + r] >> (j + GFBITS)) & 1; + } + } + + out1[i][j] = PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(u[0], u[1], u[2], u[3]); + } + + for (j = GFBITS - 1; j >= 0; j--) { + for (k = 0; k < 4; k++) { + for (r = 63; r >= 0; r--) { + u[k] <<= 1; + u[k] |= (in[i * 256 + k * 64 + r] >> j) & 1; + } + } + + out0[i][GFBITS - 1 - j] = PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(u[0], u[1], u[2], u[3]); + } + } +} + +#define NBLOCKS1_H ((SYS_N + 63) / 64) +#define NBLOCKS2_H ((SYS_N + 255) / 256) +#define NBLOCKS1_I ((GFBITS * SYS_T + 63) / 64) +#define NBLOCKS2_I ((GFBITS * SYS_T + 255) / 256) +int PQCLEAN_MCELIECE6688128_AVX_pk_gen(unsigned char *pk, uint32_t *perm, const unsigned char *sk) { + const int block_idx = NBLOCKS1_I; + + int i, j, 
k; + int row, c; + + uint64_t mat[ GFBITS * SYS_T ][ NBLOCKS2_H * 4 ]; + uint64_t ops[ GFBITS * SYS_T ][ NBLOCKS1_I ]; + + uint64_t mask; + + vec128 sk_int[ GFBITS ]; + + vec256 consts[ 32 ][ GFBITS ]; + vec256 eval[ 32 ][ GFBITS ]; + vec256 prod[ 32 ][ GFBITS ]; + vec256 tmp[ GFBITS ]; + + uint64_t list[1 << GFBITS]; + uint64_t one_row[ 128 ]; + + // compute the inverses + + PQCLEAN_MCELIECE6688128_AVX_irr_load(sk_int, sk); + + PQCLEAN_MCELIECE6688128_AVX_fft(eval, sk_int); + + PQCLEAN_MCELIECE6688128_AVX_vec256_copy(prod[0], eval[0]); + + for (i = 1; i < 32; i++) { + PQCLEAN_MCELIECE6688128_AVX_vec256_mul(prod[i], prod[i - 1], eval[i]); + } + + PQCLEAN_MCELIECE6688128_AVX_vec256_inv(tmp, prod[31]); + + for (i = 30; i >= 0; i--) { + PQCLEAN_MCELIECE6688128_AVX_vec256_mul(prod[i + 1], prod[i], tmp); + PQCLEAN_MCELIECE6688128_AVX_vec256_mul(tmp, tmp, eval[i + 1]); + } + + PQCLEAN_MCELIECE6688128_AVX_vec256_copy(prod[0], tmp); + + // fill matrix + + de_bitslicing(list, prod); + + for (i = 0; i < (1 << GFBITS); i++) { + list[i] <<= GFBITS; + list[i] |= i; + list[i] |= ((uint64_t) perm[i]) << 31; + } + + PQCLEAN_MCELIECE6688128_AVX_sort_63b(1 << GFBITS, list); + + to_bitslicing_2x(consts, prod, list); + + for (i = 0; i < (1 << GFBITS); i++) { + perm[i] = list[i] & GFMASK; + } + + for (j = 0; j < NBLOCKS2_I; j++) { + for (k = 0; k < GFBITS; k++) { + mat[ k ][ 4 * j + 0 ] = PQCLEAN_MCELIECE6688128_AVX_vec256_extract(prod[ j ][ k ], 0); + mat[ k ][ 4 * j + 1 ] = PQCLEAN_MCELIECE6688128_AVX_vec256_extract(prod[ j ][ k ], 1); + mat[ k ][ 4 * j + 2 ] = PQCLEAN_MCELIECE6688128_AVX_vec256_extract(prod[ j ][ k ], 2); + mat[ k ][ 4 * j + 3 ] = PQCLEAN_MCELIECE6688128_AVX_vec256_extract(prod[ j ][ k ], 3); + } + } + + for (i = 1; i < SYS_T; i++) { + for (j = 0; j < NBLOCKS2_I; j++) { + PQCLEAN_MCELIECE6688128_AVX_vec256_mul(prod[j], prod[j], consts[j]); + + for (k = 0; k < GFBITS; k++) { + mat[ i * GFBITS + k ][ 4 * j + 0 ] = PQCLEAN_MCELIECE6688128_AVX_vec256_extract(prod[ j ][ k ], 0); + mat[ i * GFBITS + k ][ 4 * j + 1 ] = PQCLEAN_MCELIECE6688128_AVX_vec256_extract(prod[ j ][ k ], 1); + mat[ i * GFBITS + k ][ 4 * j + 2 ] = PQCLEAN_MCELIECE6688128_AVX_vec256_extract(prod[ j ][ k ], 2); + mat[ i * GFBITS + k ][ 4 * j + 3 ] = PQCLEAN_MCELIECE6688128_AVX_vec256_extract(prod[ j ][ k ], 3); + } + } + } + + // gaussian elimination to obtain an upper triangular matrix + // and keep track of the operations in ops + + for (i = 0; i < PK_NROWS; i++) { + for (j = 0; j < NBLOCKS1_I; j++) { + ops[ i ][ j ] = 0; + } + } + + for (i = 0; i < PK_NROWS; i++) { + ops[ i ][ i / 64 ] = 1; + ops[ i ][ i / 64 ] <<= (i % 64); + } + + uint64_t column[ PK_NROWS ]; + + for (i = 0; i < PK_NROWS; i++) { + column[i] = mat[ i ][ block_idx ]; + } + + for (row = 0; row < PK_NROWS; row++) { + i = row >> 6; + j = row & 63; + + for (k = row + 1; k < PK_NROWS; k++) { + mask = mat[ row ][ i ] >> j; + mask &= 1; + mask -= 1; + + for (c = 0; c < NBLOCKS1_I; c++) { + mat[ row ][ c ] ^= mat[ k ][ c ] & mask; + ops[ row ][ c ] ^= ops[ k ][ c ] & mask; + } + } + + if ( ((mat[ row ][ i ] >> j) & 1) == 0 ) { // return if not systematic + return -1; + } + + for (k = row + 1; k < PK_NROWS; k++) { + mask = mat[ k ][ i ] >> j; + mask &= 1; + mask = -mask; + + for (c = 0; c < NBLOCKS1_I; c++) { + mat[ k ][ c ] ^= mat[ row ][ c ] & mask; + ops[ k ][ c ] ^= ops[ row ][ c ] & mask; + } + } + } + + // computing the linear map required to obtain the systematic form + + for (row = PK_NROWS - 1; row >= 0; row--) { + for (k = 0; k < row; k++) { + mask = mat[
k ][ row / 64 ] >> (row & 63); + mask &= 1; + mask = -mask; + + for (c = 0; c < NBLOCKS1_I; c++) { + ops[ k ][ c ] ^= ops[ row ][ c ] & mask; + } + } + } + + // apply the linear map to the non-systematic part + + for (j = NBLOCKS2_I; j < NBLOCKS2_H; j++) { + for (k = 0; k < GFBITS; k++) { + mat[ k ][ 4 * j + 0 ] = PQCLEAN_MCELIECE6688128_AVX_vec256_extract(prod[ j ][ k ], 0); + mat[ k ][ 4 * j + 1 ] = PQCLEAN_MCELIECE6688128_AVX_vec256_extract(prod[ j ][ k ], 1); + mat[ k ][ 4 * j + 2 ] = PQCLEAN_MCELIECE6688128_AVX_vec256_extract(prod[ j ][ k ], 2); + mat[ k ][ 4 * j + 3 ] = PQCLEAN_MCELIECE6688128_AVX_vec256_extract(prod[ j ][ k ], 3); + } + } + + for (i = 1; i < SYS_T; i++) { + for (j = NBLOCKS2_I; j < NBLOCKS2_H; j++) { + PQCLEAN_MCELIECE6688128_AVX_vec256_mul(prod[j], prod[j], consts[j]); + + for (k = 0; k < GFBITS; k++) { + mat[ i * GFBITS + k ][ 4 * j + 0 ] = PQCLEAN_MCELIECE6688128_AVX_vec256_extract(prod[ j ][ k ], 0); + mat[ i * GFBITS + k ][ 4 * j + 1 ] = PQCLEAN_MCELIECE6688128_AVX_vec256_extract(prod[ j ][ k ], 1); + mat[ i * GFBITS + k ][ 4 * j + 2 ] = PQCLEAN_MCELIECE6688128_AVX_vec256_extract(prod[ j ][ k ], 2); + mat[ i * GFBITS + k ][ 4 * j + 3 ] = PQCLEAN_MCELIECE6688128_AVX_vec256_extract(prod[ j ][ k ], 3); + } + } + } + + for (i = 0; i < PK_NROWS; i++) { + mat[ i ][ block_idx ] = column[i]; + } + + for (row = 0; row < PK_NROWS; row++) { + for (k = 0; k < NBLOCKS1_H; k++) { + one_row[ k ] = 0; + } + + for (c = 0; c < PK_NROWS; c++) { + mask = ops[ row ][ c >> 6 ] >> (c & 63); + mask &= 1; + mask = -mask; + + for (k = block_idx; k < NBLOCKS1_H; k++) { + one_row[ k ] ^= mat[ c ][ k ] & mask; + } + } + + for (k = block_idx; k < NBLOCKS1_H - 1; k++) { + PQCLEAN_MCELIECE6688128_AVX_store8(pk, one_row[k]); + pk += 8; + } + + PQCLEAN_MCELIECE6688128_AVX_store_i(pk, one_row[k], PK_ROW_BYTES % 8); + + pk += PK_ROW_BYTES % 8; + } + + // + + return 0; +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/pk_gen.h b/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/pk_gen.h new file mode 100644 index 0000000000..479d007c9a --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/pk_gen.h @@ -0,0 +1,12 @@ +#ifndef PQCLEAN_MCELIECE6688128_AVX_PK_GEN_H +#define PQCLEAN_MCELIECE6688128_AVX_PK_GEN_H +/* + This file is for public-key generation +*/ + +#include + +int PQCLEAN_MCELIECE6688128_AVX_pk_gen(unsigned char * /*pk*/, uint32_t * /*perm*/, const unsigned char * /*sk*/); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/powers.inc b/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/powers.inc new file mode 100644 index 0000000000..1c35627ab0 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/powers.inc @@ -0,0 +1,480 @@ +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + 
PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 
0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + 
PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 
0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + 
PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 
0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + 
PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5555AAAAAAAA5555, 
0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + 
PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 
0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + 
PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X6969696996969696, 
0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + 
PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 
0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + 
PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +} diff --git a/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/scalars_2x.inc b/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/scalars_2x.inc new file mode 100644 index 0000000000..5f405cade6 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/scalars_2x.inc @@ -0,0 +1,75 @@ +{ + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0X3C3CF30C0000C003, 0X0C0F0FCF0F0CF330), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0X0CCCC3F333C0000C, 0XF0000FC33C3CCF3C), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0X03C33F33FCC0C03C, 0X3C0F3F00C3C300FC), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0X0003000F3C03C0C0, 0X3C33CCC0F0F3CC30), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0XF33FF33030CF03F0, 0XC0CFFFFFCCCC30CC), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0X0CF0303300F0CCC0, 0X3FC3F3CCFFFC033F), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0XFF3F0C0CC0FF3CC0, 0XFC3030CCCCC0CFCF), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0XCF3CF0FF003FC000, 0X0FCF0C00CCF333C3), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0XC00FF3CF0303F300, 0XCFFCF33000CFF030), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0X3CCC0CC00CF0CC00, 0X00CFFCC330F30FCC), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0XF30FFC3C3FCCFC00, 0X3CCC3FCCC0F3FFF3), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0X3F0FC3F0CCF0C000, 0XF00F0C3FC003C0FF), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0X3000FF33CCF0F000, 0X330CCFCC03C0FC33) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0X0F0F0FF0F000000F, 0XF0FFFFFFF0F00F00), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0X00FFFFFFFF0000F0, 0X00FFF0FFFF0000FF), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0XFFFF00FF00000F00, 0X00FF00000F0F0FFF), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0XFFF000F00F0FF000, 0XF000F0000F00FF0F), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0XFFF0000F0FF000F0, 0XFF000000FFF00000), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0X00FF000FFF000000, 0XF0FF000FF00F0FF0), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0XFF0F0FFF0F0FF000, 0X0F0F0F00FF000F0F), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0X0FFF0000000F0000, 0X0F0F00F0F0F0F000), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0X00F000F0FFF00F00, 0X00F00F00F00F000F), + 
PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0X00F00FF00F00F000, 0X00F0F0F00000FFF0), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0XFFF000F000F00000, 0XFFFFFF0FF00F0FFF), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0X00F00F000FF00000, 0X0F0FFFF00FFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0X0000FF0F0000F000, 0XFFFF0F0FFF0FFF00) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0X00FF0000000000FF, 0X00FF00FF00FF0000), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0XFFFFFFFFFF00FF00, 0XFF00FFFF000000FF), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0XFF0000FF00FF0000, 0X0000FFFF000000FF), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0XFFFF000000FF0000, 0X00FFFF00FF000000), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0XFF00000000FF0000, 0XFFFFFF0000FF00FF), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0X00FFFFFFFF000000, 0X0000FFFF00FFFF00), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0XFF0000FFFFFF0000, 0XFF00FF0000FFFF00), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0XFF00FF00FFFF0000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0X00FFFFFFFF00FF00, 0X0000FF0000000000), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0XFFFF000000000000, 0XFF00FFFF00FFFF00), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0X00FF0000FF000000, 0X00FFFF00000000FF), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0XFF00FF00FF000000, 0X0000FF00FF00FFFF), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0X00FF00FFFF000000, 0XFF0000FFFFFF0000) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0X000000000000FFFF, 0X0000FFFF00000000), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0XFFFFFFFFFFFF0000, 0XFFFFFFFF0000FFFF), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0X0000000000000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0XFFFF0000FFFF0000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0XFFFFFFFFFFFF0000, 0X0000FFFF00000000), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0X0000FFFF00000000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0X0000FFFF00000000, 0XFFFFFFFF0000FFFF), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0XFFFF000000000000, 0X00000000FFFF0000), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0XFFFF000000000000, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0XFFFF000000000000, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0X00000000FFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0X0000000000000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + 
PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000) +} diff --git a/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/scalars_4x.inc b/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/scalars_4x.inc new file mode 100644 index 0000000000..ce7ae95030 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/scalars_4x.inc @@ -0,0 +1,91 @@ +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X3C3CF30C0000C003, 0X0C0F0FCF0F0CF330, 0XF0F30C33CF03F03F, 0X3F30CC0C000F3FCC), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0CCCC3F333C0000C, 0XF0000FC33C3CCF3C, 0X00F30FC00C3300FF, 0XFC3CF030FC3FFF03), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X03C33F33FCC0C03C, 0X3C0F3F00C3C300FC, 0XF3CC3CF3F3FCF33F, 0X33FFFCFF0CCF3CC3), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0003000F3C03C0C0, 0X3C33CCC0F0F3CC30, 0X3C0FC0FC303C3F3C, 0X003CFF33C3CC30CF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF33FF33030CF03F0, 0XC0CFFFFFCCCC30CC, 0XFC30CF303F3FF00F, 0XCFF3CF33C00F3003), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0CF0303300F0CCC0, 0X3FC3F3CCFFFC033F, 0X33300C0CC3300CF3, 0X00F3CC0CF3003CCF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFF3F0C0CC0FF3CC0, 0XFC3030CCCCC0CFCF, 0X3C030CF3F03FF3F3, 0X3C000CFCCC3C3333), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XCF3CF0FF003FC000, 0X0FCF0C00CCF333C3, 0X3CCC03FCCC3FFC03, 0XF3CF03C0FCF03FF0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XC00FF3CF0303F300, 0XCFFCF33000CFF030, 0X033C3C3CF0003FC3, 0X3F3C3CF0C330330C), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X3CCC0CC00CF0CC00, 0X00CFFCC330F30FCC, 0XFFC0FF00F0FF0F03, 0X33CCFCC0FF0033F0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XF30FFC3C3FCCFC00, 0X3CCC3FCCC0F3FFF3, 0XF3F30CF003FCC303, 0X33C300C0F0C003F3), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X3F0FC3F0CCF0C000, 0XF00F0C3FC003C0FF, 0X30CFCFC3CC0F3000, 0X003FF0003F00C00C), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X3000FF33CCF0F000, 0X330CCFCC03C0FC33, 0X0CF30CCF3FCFCC0F, 0XCFF3C3033F030FFF) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0F0F0FF0F000000F, 0XF0FFFFFFF0F00F00, 0X0F0F00FF0FF0FFFF, 0XFF0F0F00F000F0FF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X00FFFFFFFF0000F0, 0X00FFF0FFFF0000FF, 0XF000F0F00F00FF0F, 0X0FFFFFFFFF00000F), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFF00FF00000F00, 0X00FF00000F0F0FFF, 0X000FFFF0FFF0FF0F, 0XF0FFFF000F00F0FF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFF000F00F0FF000, 0XF000F0000F00FF0F, 0X00F00FFF00000FF0, 0X0F0000F00FFF0FFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFF0000F0FF000F0, 0XFF000000FFF00000, 0XFFFFF0000FFFF00F, 0X0F0F0F00FF0F000F), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X00FF000FFF000000, 0XF0FF000FF00F0FF0, 0XFFF0FFF0000FFFF0, 0X000F0F0FFFF0F000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFF0F0FFF0F0FF000, 0X0F0F0F00FF000F0F, 0XF0F0F0000F0F0F00, 0XF0FFFF0F00F0FF0F), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0FFF0000000F0000, 0X0F0F00F0F0F0F000, 0X00F000F0F00FFF00, 0X0F0F000F0F00F0FF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X00F000F0FFF00F00, 0X00F00F00F00F000F, 0XF0FF0F0FFF00F0FF, 0X0000F0FF00FF0F0F), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X00F00FF00F00F000, 0X00F0F0F00000FFF0, 0XF0FF0FFFF0F0F0FF, 0X00FFFF0FF0FFF0F0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFF000F000F00000, 0XFFFFFF0FF00F0FFF, 0X00FFFFFFFFFFFFF0, 0X0000000F00F0FFF0), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X00F00F000FF00000, 0X0F0FFFF00FFFFFFF, 0X00FFF0F0FF000F0F, 0XF0F00000FF00F0F0), + 
PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000FF0F0000F000, 0XFFFF0F0FFF0FFF00, 0X000FFFF0000FFF00, 0X0F0F0FFFFFFFFFFF) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X00FF0000000000FF, 0X00FF00FF00FF0000, 0XFFFF00FF00FF00FF, 0XFF0000FFFFFF00FF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFFFF00FF00, 0XFF00FFFF000000FF, 0X00FFFF000000FF00, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFF0000FF00FF0000, 0X0000FFFF000000FF, 0XFFFF00FFFFFFFF00, 0XFFFF000000FFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFF000000FF0000, 0X00FFFF00FF000000, 0X0000FFFF00FFFFFF, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFF00000000FF0000, 0XFFFFFF0000FF00FF, 0X00FF0000FF0000FF, 0XFFFFFF00FFFFFF00), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X00FFFFFFFF000000, 0X0000FFFF00FFFF00, 0XFFFF0000FF00FFFF, 0X00FFFF00FFFF00FF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFF0000FFFFFF0000, 0XFF00FF0000FFFF00, 0XFF000000FFFFFF00, 0X0000FFFF00FF0000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFF00FF00FFFF0000, 0X00000000FFFFFFFF, 0X000000000000FFFF, 0X000000FFFF000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X00FFFFFFFF00FF00, 0X0000FF0000000000, 0XFF00FF00FFFF0000, 0XFF00FF0000FF00FF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFF000000000000, 0XFF00FFFF00FFFF00, 0XFFFF00FFFF00FFFF, 0X00FF0000000000FF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X00FF0000FF000000, 0X00FFFF00000000FF, 0XFFFFFFFFFF00FF00, 0XFF00FFFF00FF00FF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFF00FF00FF000000, 0X0000FF00FF00FFFF, 0XFFFF00FFFF0000FF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X00FF00FFFF000000, 0XFF0000FFFFFF0000, 0X0000FF00000000FF, 0X0000FF000000FFFF) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X000000000000FFFF, 0X0000FFFF00000000, 0XFFFFFFFFFFFFFFFF, 0X0000FFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFFFFFF0000, 0XFFFFFFFF0000FFFF, 0XFFFFFFFF00000000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X00000000FFFFFFFF, 0XFFFF000000000000, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000000000000000, 0X0000FFFF00000000, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFFFFFF0000, 0X0000FFFF00000000, 0X00000000FFFF0000, 0X00000000FFFF0000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000FFFF00000000, 0XFFFF0000FFFF0000, 0X0000FFFFFFFFFFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000, 0X0000FFFFFFFFFFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0XFFFFFFFF00000000, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000FFFF00000000, 0XFFFFFFFF0000FFFF, 0X000000000000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFF000000000000, 0X00000000FFFF0000, 0X000000000000FFFF, 0X0000FFFF00000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFF000000000000, 0XFFFF0000FFFFFFFF, 0XFFFFFFFFFFFF0000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFF000000000000, 0XFFFF0000FFFFFFFF, 0XFFFFFFFF0000FFFF, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0X0000000000000000, 0XFFFF0000FFFFFFFF, 0X0000FFFFFFFFFFFF) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X0000000000000000, 0X00000000FFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 
0XFFFFFFFFFFFFFFFF, 0XFFFFFFFF00000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0X0000000000000000, 0XFFFFFFFF00000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFF00000000, 0X00000000FFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFF00000000) +}, +{ + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF) +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/sk_gen.c b/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/sk_gen.c new file mode 100644 index 0000000000..da99dae559 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/sk_gen.c @@ -0,0 +1,98 @@ +/* + This file is for secret-key generation +*/ + +#include "sk_gen.h" + +#include "controlbits.h" +#include "gf.h" +#include "params.h" +#include "util.h" + +/* input: f, element in GF((2^m)^t) */ +/* output: out, 
minimal polynomial of f */
+/* return: 0 for success and -1 for failure */
+int PQCLEAN_MCELIECE6688128_AVX_genpoly_gen(gf *out, gf *f) {
+    int i, j, k, c;
+
+    gf mat[ SYS_T + 1 ][ SYS_T ];
+    gf mask, inv, t;
+
+    // fill matrix
+
+    mat[0][0] = 1;
+
+    for (i = 1; i < SYS_T; i++) {
+        mat[0][i] = 0;
+    }
+
+    for (i = 0; i < SYS_T; i++) {
+        mat[1][i] = f[i];
+    }
+
+    for (j = 2; j <= SYS_T; j++) {
+        PQCLEAN_MCELIECE6688128_AVX_GF_mul(mat[j], mat[j - 1], f);
+    }
+
+    // gaussian
+
+    for (j = 0; j < SYS_T; j++) {
+        for (k = j + 1; k < SYS_T; k++) {
+            mask = PQCLEAN_MCELIECE6688128_AVX_gf_iszero(mat[ j ][ j ]);
+
+            for (c = j; c < SYS_T + 1; c++) {
+                mat[ c ][ j ] ^= mat[ c ][ k ] & mask;
+            }
+
+        }
+
+        if ( mat[ j ][ j ] == 0 ) { // return if not systematic
+            return -1;
+        }
+
+        inv = PQCLEAN_MCELIECE6688128_AVX_gf_inv(mat[j][j]);
+
+        for (c = j; c < SYS_T + 1; c++) {
+            mat[ c ][ j ] = PQCLEAN_MCELIECE6688128_AVX_gf_mul(mat[ c ][ j ], inv) ;
+        }
+
+        for (k = 0; k < SYS_T; k++) {
+            if (k != j) {
+                t = mat[ j ][ k ];
+
+                for (c = j; c < SYS_T + 1; c++) {
+                    mat[ c ][ k ] ^= PQCLEAN_MCELIECE6688128_AVX_gf_mul(mat[ c ][ j ], t);
+                }
+            }
+        }
+    }
+
+    for (i = 0; i < SYS_T; i++) {
+        out[i] = mat[ SYS_T ][ i ];
+    }
+
+    return 0;
+}
+
+/* input: permutation p represented as a list of 32-bit integers */
+/* output: -1 if some integer repeats in p */
+/*         0 otherwise */
+int PQCLEAN_MCELIECE6688128_AVX_perm_check(const uint32_t *p) {
+    int i;
+    uint64_t list[1 << GFBITS];
+
+    for (i = 0; i < (1 << GFBITS); i++) {
+        list[i] = p[i];
+    }
+
+    PQCLEAN_MCELIECE6688128_AVX_sort_63b(1 << GFBITS, list);
+
+    for (i = 1; i < (1 << GFBITS); i++) {
+        if (list[i - 1] == list[i]) {
+            return -1;
+        }
+    }
+
+    return 0;
+}
+
diff --git a/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/sk_gen.h b/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/sk_gen.h
new file mode 100644
index 0000000000..1e5c04e2ae
--- /dev/null
+++ b/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/sk_gen.h
@@ -0,0 +1,16 @@
+#ifndef PQCLEAN_MCELIECE6688128_AVX_SK_GEN_H
+#define PQCLEAN_MCELIECE6688128_AVX_SK_GEN_H
+/*
+  This file is for secret-key generation
+*/
+
+
+#include "gf.h"
+
+#include <stdint.h>
+
+int PQCLEAN_MCELIECE6688128_AVX_genpoly_gen(gf * /*out*/, gf * /*f*/);
+int PQCLEAN_MCELIECE6688128_AVX_perm_check(const uint32_t * /*p*/);
+
+#endif
+
diff --git a/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/syndrome_asm.S b/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/syndrome_asm.S
new file mode 100644
index 0000000000..2152decf03
--- /dev/null
+++ b/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/syndrome_asm.S
@@ -0,0 +1,810 @@
+
+# qhasm: int64 input_0
+
+# qhasm: int64 input_1
+
+# qhasm: int64 input_2
+
+# qhasm: int64 input_3
+
+# qhasm: int64 input_4
+
+# qhasm: int64 input_5
+
+# qhasm: stack64 input_6
+
+# qhasm: stack64 input_7
+
+# qhasm: int64 caller_r11
+
+# qhasm: int64 caller_r12
+
+# qhasm: int64 caller_r13
+
+# qhasm: int64 caller_r14
+
+# qhasm: int64 caller_r15
+
+# qhasm: int64 caller_rbx
+
+# qhasm: int64 caller_rbp
+
+# qhasm: int64 b64
+
+# qhasm: int64 synd
+
+# qhasm: int64 addr
+
+# qhasm: int64 c
+
+# qhasm: int64 c_all
+
+# qhasm: int64 row
+
+# qhasm: int64 p
+
+# qhasm: int64 e
+
+# qhasm: int64 s
+
+# qhasm: reg256 pp
+
+# qhasm: reg256 ee
+
+# qhasm: reg256 ss
+
+# qhasm: int64 buf_ptr
+
+# qhasm: stack256 buf
+
+# qhasm: enter syndrome_asm
+.p2align 5
+.global _PQCLEAN_MCELIECE6688128_AVX_syndrome_asm
+.global PQCLEAN_MCELIECE6688128_AVX_syndrome_asm
+_PQCLEAN_MCELIECE6688128_AVX_syndrome_asm:
+PQCLEAN_MCELIECE6688128_AVX_syndrome_asm: +mov %rsp,%r11 +and $31,%r11 +add $32,%r11 +sub %r11,%rsp + +# qhasm: input_1 += 1044364 +# asm 1: add $1044364,buf_ptr=int64#4 +# asm 2: leaq buf_ptr=%rcx +leaq 0(%rsp),%rcx + +# qhasm: row = 1664 +# asm 1: mov $1664,>row=int64#5 +# asm 2: mov $1664,>row=%r8 +mov $1664,%r8 + +# qhasm: loop: +._loop: + +# qhasm: row -= 1 +# asm 1: sub $1,ss=reg256#1 +# asm 2: vmovupd 0(ss=%ymm0 +vmovupd 0(%rsi),%ymm0 + +# qhasm: ee = mem256[ input_2 + 208 ] +# asm 1: vmovupd 208(ee=reg256#2 +# asm 2: vmovupd 208(ee=%ymm1 +vmovupd 208(%rdx),%ymm1 + +# qhasm: ss &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 32(pp=%ymm1 +vmovupd 32(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 240 ] +# asm 1: vmovupd 240(ee=reg256#3 +# asm 2: vmovupd 240(ee=%ymm2 +vmovupd 240(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 64(pp=%ymm1 +vmovupd 64(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 272 ] +# asm 1: vmovupd 272(ee=reg256#3 +# asm 2: vmovupd 272(ee=%ymm2 +vmovupd 272(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 96(pp=%ymm1 +vmovupd 96(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 304 ] +# asm 1: vmovupd 304(ee=reg256#3 +# asm 2: vmovupd 304(ee=%ymm2 +vmovupd 304(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 128(pp=%ymm1 +vmovupd 128(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 336 ] +# asm 1: vmovupd 336(ee=reg256#3 +# asm 2: vmovupd 336(ee=%ymm2 +vmovupd 336(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 160(pp=%ymm1 +vmovupd 160(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 368 ] +# asm 1: vmovupd 368(ee=reg256#3 +# asm 2: vmovupd 368(ee=%ymm2 +vmovupd 368(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 192(pp=%ymm1 +vmovupd 192(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 400 ] +# asm 1: vmovupd 400(ee=reg256#3 +# asm 2: vmovupd 400(ee=%ymm2 +vmovupd 400(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 224(pp=%ymm1 +vmovupd 224(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 432 ] +# asm 1: vmovupd 432(ee=reg256#3 +# asm 2: vmovupd 432(ee=%ymm2 +vmovupd 432(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 256(pp=%ymm1 +vmovupd 256(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 464 ] +# asm 1: vmovupd 464(ee=reg256#3 +# asm 2: vmovupd 464(ee=%ymm2 +vmovupd 464(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 288(pp=%ymm1 +vmovupd 288(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 496 ] +# asm 1: vmovupd 496(ee=reg256#3 +# asm 2: vmovupd 496(ee=%ymm2 +vmovupd 496(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 320(pp=%ymm1 +vmovupd 320(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 528 ] +# asm 1: vmovupd 528(ee=reg256#3 +# asm 2: vmovupd 528(ee=%ymm2 +vmovupd 528(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 352(pp=%ymm1 +vmovupd 352(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 560 ] +# asm 1: vmovupd 560(ee=reg256#3 +# asm 2: vmovupd 560(ee=%ymm2 +vmovupd 560(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 384(pp=%ymm1 +vmovupd 384(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 592 ] +# asm 1: vmovupd 592(ee=reg256#3 +# asm 2: vmovupd 592(ee=%ymm2 +vmovupd 592(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 416(pp=%ymm1 +vmovupd 416(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 624 ] +# 
asm 1: vmovupd 624(ee=reg256#3 +# asm 2: vmovupd 624(ee=%ymm2 +vmovupd 624(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 448(pp=%ymm1 +vmovupd 448(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 656 ] +# asm 1: vmovupd 656(ee=reg256#3 +# asm 2: vmovupd 656(ee=%ymm2 +vmovupd 656(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 480(pp=%ymm1 +vmovupd 480(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 688 ] +# asm 1: vmovupd 688(ee=reg256#3 +# asm 2: vmovupd 688(ee=%ymm2 +vmovupd 688(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 512(pp=%ymm1 +vmovupd 512(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 720 ] +# asm 1: vmovupd 720(ee=reg256#3 +# asm 2: vmovupd 720(ee=%ymm2 +vmovupd 720(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 544(pp=%ymm1 +vmovupd 544(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 752 ] +# asm 1: vmovupd 752(ee=reg256#3 +# asm 2: vmovupd 752(ee=%ymm2 +vmovupd 752(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 576(pp=%ymm1 +vmovupd 576(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 784 ] +# asm 1: vmovupd 784(ee=reg256#3 +# asm 2: vmovupd 784(ee=%ymm2 +vmovupd 784(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand buf=stack256#1 +# asm 2: vmovapd buf=0(%rsp) +vmovapd %ymm0,0(%rsp) + +# qhasm: s = mem64[input_1 + 608] +# asm 1: movq 608(s=int64#6 +# asm 2: movq 608(s=%r9 +movq 608(%rsi),%r9 + +# qhasm: e = mem64[input_2 + 816] +# asm 1: movq 816(e=int64#7 +# asm 2: movq 816(e=%rax +movq 816(%rdx),%rax + +# qhasm: s &= e +# asm 1: and p=int64#7 +# asm 2: movq 616(p=%rax +movq 616(%rsi),%rax + +# qhasm: e = mem64[input_2 + 824] +# asm 1: movq 824(e=int64#8 +# asm 2: movq 824(e=%r10 +movq 824(%rdx),%r10 + +# qhasm: p &= e +# asm 1: and p=int64#7d +# asm 2: movl 624(p=%eax +movl 624(%rsi),%eax + +# qhasm: e = *(uint32 *)(input_2 + 832) +# asm 1: movl 832(e=int64#8d +# asm 2: movl 832(e=%r10d +movl 832(%rdx),%r10d + +# qhasm: p &= e +# asm 1: and c_all=int64#6 +# asm 2: popcnt c_all=%r9 +popcnt %r9, %r9 + +# qhasm: b64 = mem64[ buf_ptr + 0 ] +# asm 1: movq 0(b64=int64#7 +# asm 2: movq 0(b64=%rax +movq 0(%rcx),%rax + +# qhasm: c = count(b64) +# asm 1: popcnt c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# qhasm: c_all ^= c +# asm 1: xor b64=int64#7 +# asm 2: movq 8(b64=%rax +movq 8(%rcx),%rax + +# qhasm: c = count(b64) +# asm 1: popcnt c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# qhasm: c_all ^= c +# asm 1: xor b64=int64#7 +# asm 2: movq 16(b64=%rax +movq 16(%rcx),%rax + +# qhasm: c = count(b64) +# asm 1: popcnt c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# qhasm: c_all ^= c +# asm 1: xor b64=int64#7 +# asm 2: movq 24(b64=%rax +movq 24(%rcx),%rax + +# qhasm: c = count(b64) +# asm 1: popcnt c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# qhasm: c_all ^= c +# asm 1: xor addr=int64#7 +# asm 2: mov addr=%rax +mov %r8,%rax + +# qhasm: (uint64) addr >>= 3 +# asm 1: shr $3,synd=int64#8 +# asm 2: movzbq 0(synd=%r10 +movzbq 0(%rax),%r10 + +# qhasm: synd <<= 1 +# asm 1: shl $1,ss=reg256#1 +# asm 2: vmovupd 0(ss=%ymm0 +vmovupd 0(%rdi),%ymm0 + +# qhasm: ee = mem256[ input_2 + 0 ] +# asm 1: vmovupd 0(ee=reg256#2 +# asm 2: vmovupd 0(ee=%ymm1 +vmovupd 0(%rdx),%ymm1 + +# qhasm: ss ^= ee +# asm 1: vpxor ss=reg256#1 +# asm 2: vmovupd 32(ss=%ymm0 +vmovupd 32(%rdi),%ymm0 + +# qhasm: ee = mem256[ input_2 + 32 ] +# asm 1: vmovupd 32(ee=reg256#2 +# asm 2: vmovupd 32(ee=%ymm1 +vmovupd 32(%rdx),%ymm1 + +# 
qhasm: ss ^= ee +# asm 1: vpxor ss=reg256#1 +# asm 2: vmovupd 64(ss=%ymm0 +vmovupd 64(%rdi),%ymm0 + +# qhasm: ee = mem256[ input_2 + 64 ] +# asm 1: vmovupd 64(ee=reg256#2 +# asm 2: vmovupd 64(ee=%ymm1 +vmovupd 64(%rdx),%ymm1 + +# qhasm: ss ^= ee +# asm 1: vpxor ss=reg256#1 +# asm 2: vmovupd 96(ss=%ymm0 +vmovupd 96(%rdi),%ymm0 + +# qhasm: ee = mem256[ input_2 + 96 ] +# asm 1: vmovupd 96(ee=reg256#2 +# asm 2: vmovupd 96(ee=%ymm1 +vmovupd 96(%rdx),%ymm1 + +# qhasm: ss ^= ee +# asm 1: vpxor ss=reg256#1 +# asm 2: vmovupd 128(ss=%ymm0 +vmovupd 128(%rdi),%ymm0 + +# qhasm: ee = mem256[ input_2 + 128 ] +# asm 1: vmovupd 128(ee=reg256#2 +# asm 2: vmovupd 128(ee=%ymm1 +vmovupd 128(%rdx),%ymm1 + +# qhasm: ss ^= ee +# asm 1: vpxor ss=reg256#1 +# asm 2: vmovupd 160(ss=%ymm0 +vmovupd 160(%rdi),%ymm0 + +# qhasm: ee = mem256[ input_2 + 160 ] +# asm 1: vmovupd 160(ee=reg256#2 +# asm 2: vmovupd 160(ee=%ymm1 +vmovupd 160(%rdx),%ymm1 + +# qhasm: ss ^= ee +# asm 1: vpxor s=int64#2 +# asm 2: movq 192(s=%rsi +movq 192(%rdi),%rsi + +# qhasm: e = mem64[ input_2 + 192 ] +# asm 1: movq 192(e=int64#4 +# asm 2: movq 192(e=%rcx +movq 192(%rdx),%rcx + +# qhasm: s ^= e +# asm 1: xor s=int64#2 +# asm 2: movq 200(s=%rsi +movq 200(%rdi),%rsi + +# qhasm: e = mem64[ input_2 + 200 ] +# asm 1: movq 200(e=int64#3 +# asm 2: movq 200(e=%rdx +movq 200(%rdx),%rdx + +# qhasm: s ^= e +# asm 1: xor mask0=reg128#1 +# asm 2: movdqa PQCLEAN_MCELIECE6688128_AVX_MASK5_0(%rip),>mask0=%xmm0 +movdqa PQCLEAN_MCELIECE6688128_AVX_MASK5_0(%rip),%xmm0 + +# qhasm: mask1 aligned= mem128[ PQCLEAN_MCELIECE6688128_AVX_MASK5_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE6688128_AVX_MASK5_1(%rip),>mask1=reg128#2 +# asm 2: movdqa PQCLEAN_MCELIECE6688128_AVX_MASK5_1(%rip),>mask1=%xmm1 +movdqa PQCLEAN_MCELIECE6688128_AVX_MASK5_1(%rip),%xmm1 + +# qhasm: mask2 aligned= mem128[ PQCLEAN_MCELIECE6688128_AVX_MASK4_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE6688128_AVX_MASK4_0(%rip),>mask2=reg128#3 +# asm 2: movdqa PQCLEAN_MCELIECE6688128_AVX_MASK4_0(%rip),>mask2=%xmm2 +movdqa PQCLEAN_MCELIECE6688128_AVX_MASK4_0(%rip),%xmm2 + +# qhasm: mask3 aligned= mem128[ PQCLEAN_MCELIECE6688128_AVX_MASK4_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE6688128_AVX_MASK4_1(%rip),>mask3=reg128#4 +# asm 2: movdqa PQCLEAN_MCELIECE6688128_AVX_MASK4_1(%rip),>mask3=%xmm3 +movdqa PQCLEAN_MCELIECE6688128_AVX_MASK4_1(%rip),%xmm3 + +# qhasm: mask4 aligned= mem128[ PQCLEAN_MCELIECE6688128_AVX_MASK3_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE6688128_AVX_MASK3_0(%rip),>mask4=reg128#5 +# asm 2: movdqa PQCLEAN_MCELIECE6688128_AVX_MASK3_0(%rip),>mask4=%xmm4 +movdqa PQCLEAN_MCELIECE6688128_AVX_MASK3_0(%rip),%xmm4 + +# qhasm: mask5 aligned= mem128[ PQCLEAN_MCELIECE6688128_AVX_MASK3_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE6688128_AVX_MASK3_1(%rip),>mask5=reg128#6 +# asm 2: movdqa PQCLEAN_MCELIECE6688128_AVX_MASK3_1(%rip),>mask5=%xmm5 +movdqa PQCLEAN_MCELIECE6688128_AVX_MASK3_1(%rip),%xmm5 + +# qhasm: x0 = mem128[ input_0 + 0 ] +# asm 1: movdqu 0(x0=reg128#7 +# asm 2: movdqu 0(x0=%xmm6 +movdqu 0(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 128 ] +# asm 1: movdqu 128(x1=reg128#8 +# asm 2: movdqu 128(x1=%xmm7 +movdqu 128(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 256 ] +# asm 1: movdqu 256(x2=reg128#9 +# asm 2: movdqu 256(x2=%xmm8 +movdqu 256(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 384 ] +# asm 1: movdqu 384(x3=reg128#10 +# asm 2: movdqu 384(x3=%xmm9 +movdqu 384(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 512 ] +# asm 1: movdqu 512(x4=reg128#11 +# asm 2: movdqu 512(x4=%xmm10 +movdqu 512(%rdi),%xmm10 + +# qhasm: x5 = 
mem128[ input_0 + 640 ] +# asm 1: movdqu 640(x5=reg128#12 +# asm 2: movdqu 640(x5=%xmm11 +movdqu 640(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 768 ] +# asm 1: movdqu 768(x6=reg128#13 +# asm 2: movdqu 768(x6=%xmm12 +movdqu 768(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 896 ] +# asm 1: movdqu 896(x7=reg128#14 +# asm 2: movdqu 896(x7=%xmm13 +movdqu 896(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: 
vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor 
%xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 0 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 16(x0=%xmm6 +movdqu 16(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 144 ] +# asm 1: movdqu 144(x1=reg128#8 +# asm 2: movdqu 144(x1=%xmm7 +movdqu 144(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 272 ] +# asm 1: movdqu 272(x2=reg128#9 +# asm 2: movdqu 272(x2=%xmm8 +movdqu 272(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 400 ] +# asm 1: movdqu 400(x3=reg128#10 +# asm 2: movdqu 400(x3=%xmm9 +movdqu 400(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 528 ] +# asm 1: movdqu 528(x4=reg128#11 +# asm 2: movdqu 528(x4=%xmm10 +movdqu 528(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 656 ] +# asm 1: movdqu 656(x5=reg128#12 +# asm 2: movdqu 656(x5=%xmm11 +movdqu 656(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 784 ] +# asm 1: movdqu 784(x6=reg128#13 +# asm 2: movdqu 784(x6=%xmm12 +movdqu 784(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 912 ] +# asm 1: movdqu 912(x7=reg128#14 +# asm 2: movdqu 912(x7=%xmm13 +movdqu 912(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + 
+# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + 
+# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 
2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 16 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 32(x0=%xmm6 +movdqu 32(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 160 ] +# asm 1: movdqu 160(x1=reg128#8 +# asm 2: movdqu 160(x1=%xmm7 +movdqu 160(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 288 ] +# asm 1: movdqu 288(x2=reg128#9 +# asm 2: movdqu 288(x2=%xmm8 +movdqu 288(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 416 ] +# asm 1: movdqu 416(x3=reg128#10 +# asm 2: movdqu 416(x3=%xmm9 +movdqu 416(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 544 ] +# asm 1: movdqu 544(x4=reg128#11 +# asm 2: movdqu 544(x4=%xmm10 +movdqu 544(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 672 ] +# asm 1: movdqu 672(x5=reg128#12 +# asm 2: movdqu 672(x5=%xmm11 +movdqu 672(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 800 ] +# asm 1: movdqu 800(x6=reg128#13 +# asm 2: movdqu 800(x6=%xmm12 +movdqu 800(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 928 ] +# asm 1: movdqu 928(x7=reg128#14 +# asm 2: movdqu 928(x7=%xmm13 +movdqu 928(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 
2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor 
%xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 32 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 48(x0=%xmm6 +movdqu 48(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 176 ] +# asm 1: movdqu 176(x1=reg128#8 +# asm 2: movdqu 176(x1=%xmm7 +movdqu 176(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 304 ] +# asm 1: movdqu 304(x2=reg128#9 +# asm 2: movdqu 304(x2=%xmm8 +movdqu 304(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 432 ] +# asm 1: movdqu 432(x3=reg128#10 +# asm 2: movdqu 432(x3=%xmm9 +movdqu 432(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 560 ] +# asm 1: movdqu 560(x4=reg128#11 +# asm 2: movdqu 560(x4=%xmm10 +movdqu 560(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 688 ] +# asm 1: movdqu 688(x5=reg128#12 +# asm 2: movdqu 688(x5=%xmm11 +movdqu 688(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 816 ] +# asm 1: movdqu 816(x6=reg128#13 +# asm 2: movdqu 816(x6=%xmm12 +movdqu 816(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 944 ] +# asm 1: movdqu 944(x7=reg128#14 +# asm 2: movdqu 944(x7=%xmm13 +movdqu 944(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + 
+# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + 
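Every stanza in this generated file repeats the same masked shift/OR butterfly: for a register pair (x_a, x_b), v00 keeps x_a's low sub-lanes, v10 lifts x_b's low sub-lanes up by the step width, v01 drops x_a's high sub-lanes down, v11 keeps x_b's high sub-lanes, and the two ORs reassemble the pair. Below is a minimal C sketch of that single step on plain 64-bit words instead of the XMM registers used here; the function name, the demo values, and the mask literal are illustrative only and are not taken from the PQClean sources.

#include <stdint.h>
#include <stdio.h>

/* Illustrative sketch of the butterfly step the vpand/vpsllq/vpsrlq/vpor
 * (and the 32/16/8-bit vpslld/vpsllw variants) stanzas perform: swap the
 * high s-bit sub-lane of a with the low s-bit sub-lane of b, leaving the
 * other sub-lanes in place. lo_mask selects the low s bits of each lane. */
static void interleave_step(uint64_t *a, uint64_t *b, unsigned s, uint64_t lo_mask)
{
    uint64_t hi_mask = ~lo_mask;
    uint64_t v00 = *a & lo_mask;         /* bits of a that stay put          */
    uint64_t v10 = (*b & lo_mask) << s;  /* low bits of b move up into a     */
    uint64_t v01 = (*a & hi_mask) >> s;  /* high bits of a move down into b  */
    uint64_t v11 = *b & hi_mask;         /* bits of b that stay put          */
    *a = v00 | v10;
    *b = v01 | v11;
}

int main(void)
{
    /* Example values only: exchange the 32-bit halves across the pair. */
    uint64_t a = 0x1111111122222222ULL;
    uint64_t b = 0x3333333344444444ULL;
    interleave_step(&a, &b, 32, 0x00000000FFFFFFFFULL);
    printf("a = %016llx\n", (unsigned long long)a); /* 4444444422222222 */
    printf("b = %016llx\n", (unsigned long long)b); /* 3333333311111111 */
    return 0;
}

Repeating this step with shift widths 32, 16, 8 (and, after the second set of mask constants is loaded, 4, 2, 1) and the matching per-lane masks gives the complete bit-matrix transpose; the qhasm-generated assembly simply unrolls every step, load, and store explicitly across the sixteen XMM registers.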
+# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor 
x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 48 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 64(x0=%xmm6 +movdqu 64(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 192 ] +# asm 1: movdqu 192(x1=reg128#8 +# asm 2: movdqu 192(x1=%xmm7 +movdqu 192(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 320 ] +# asm 1: movdqu 320(x2=reg128#9 +# asm 2: movdqu 320(x2=%xmm8 +movdqu 320(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 448 ] +# asm 1: movdqu 448(x3=reg128#10 +# asm 2: movdqu 448(x3=%xmm9 +movdqu 448(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 576 ] +# asm 1: movdqu 576(x4=reg128#11 +# asm 2: movdqu 576(x4=%xmm10 +movdqu 576(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 704 ] +# asm 1: movdqu 704(x5=reg128#12 +# asm 2: movdqu 704(x5=%xmm11 +movdqu 704(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 832 ] +# asm 1: movdqu 832(x6=reg128#13 +# asm 2: movdqu 832(x6=%xmm12 +movdqu 832(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 960 ] +# asm 1: movdqu 960(x7=reg128#14 +# asm 2: movdqu 960(x7=%xmm13 +movdqu 960(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 
+# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor 
x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 64 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 80(x0=%xmm6 +movdqu 80(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 208 ] +# asm 1: movdqu 208(x1=reg128#8 +# asm 2: movdqu 208(x1=%xmm7 +movdqu 208(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 336 ] +# asm 1: movdqu 336(x2=reg128#9 +# asm 2: movdqu 336(x2=%xmm8 +movdqu 336(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 464 ] +# asm 1: movdqu 464(x3=reg128#10 +# asm 2: movdqu 464(x3=%xmm9 +movdqu 464(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 592 ] +# asm 1: movdqu 592(x4=reg128#11 +# asm 2: movdqu 592(x4=%xmm10 +movdqu 592(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 720 ] +# asm 1: movdqu 
720(x5=reg128#12 +# asm 2: movdqu 720(x5=%xmm11 +movdqu 720(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 848 ] +# asm 1: movdqu 848(x6=reg128#13 +# asm 2: movdqu 848(x6=%xmm12 +movdqu 848(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 976 ] +# asm 1: movdqu 976(x7=reg128#14 +# asm 2: movdqu 976(x7=%xmm13 +movdqu 976(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand 
%xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 
= v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 80 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 96(x0=%xmm6 +movdqu 96(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 224 ] +# asm 1: movdqu 224(x1=reg128#8 +# asm 2: movdqu 224(x1=%xmm7 +movdqu 224(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 352 ] +# asm 1: movdqu 352(x2=reg128#9 +# asm 2: movdqu 352(x2=%xmm8 +movdqu 352(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 480 ] +# asm 1: movdqu 480(x3=reg128#10 +# asm 2: movdqu 480(x3=%xmm9 +movdqu 480(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 608 ] +# asm 1: movdqu 608(x4=reg128#11 +# asm 2: movdqu 608(x4=%xmm10 +movdqu 608(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 736 ] +# asm 1: movdqu 736(x5=reg128#12 +# asm 2: movdqu 736(x5=%xmm11 +movdqu 736(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 864 ] +# asm 1: movdqu 864(x6=reg128#13 +# asm 2: movdqu 864(x6=%xmm12 +movdqu 864(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 992 ] +# asm 1: movdqu 992(x7=reg128#14 +# asm 2: movdqu 992(x7=%xmm13 +movdqu 992(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 
1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: 
vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor 
%xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 96 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 112(x0=%xmm6 +movdqu 112(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 240 ] +# asm 1: movdqu 240(x1=reg128#8 +# asm 2: movdqu 240(x1=%xmm7 +movdqu 240(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 368 ] +# asm 1: movdqu 368(x2=reg128#9 +# asm 2: movdqu 368(x2=%xmm8 +movdqu 368(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 496 ] +# asm 1: movdqu 496(x3=reg128#10 +# asm 2: movdqu 496(x3=%xmm9 +movdqu 496(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 624 ] +# asm 1: movdqu 624(x4=reg128#11 +# asm 2: movdqu 624(x4=%xmm10 +movdqu 624(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 752 ] +# asm 1: movdqu 752(x5=reg128#12 +# asm 2: movdqu 752(x5=%xmm11 +movdqu 752(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 880 ] +# asm 1: movdqu 880(x6=reg128#13 +# asm 2: movdqu 880(x6=%xmm12 +movdqu 880(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 1008 ] +# asm 1: movdqu 1008(x7=reg128#14 +# asm 2: movdqu 1008(x7=%xmm13 +movdqu 1008(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#1 +# asm 2: vpand v00=%xmm0 +vpand %xmm0,%xmm9,%xmm0 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#13 +# asm 2: vpsllq $32,v10=%xmm12 +vpsllq $32,%xmm13,%xmm12 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 
+vpand %xmm1,%xmm13,%xmm1 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm12,%xmm0,%xmm0 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm9,%xmm1 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm2,%xmm14,%xmm9 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#13 +# asm 2: vpslld $16,v10=%xmm12 +vpslld $16,%xmm11,%xmm12 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#14 +# asm 2: vpsrld $16,v01=%xmm13 +vpsrld $16,%xmm14,%xmm13 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm12,%xmm9,%xmm9 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm13,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm10,%xmm12 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#14 +# asm 2: vpslld $16,v10=%xmm13 +vpslld $16,%xmm0,%xmm13 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm3,%xmm0,%xmm0 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#13 +# asm 2: vpor x1=%xmm12 +vpor %xmm13,%xmm12,%xmm12 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm10,%xmm0 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm2,%xmm6,%xmm10 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#14 +# asm 2: vpslld $16,v10=%xmm13 +vpslld $16,%xmm8,%xmm13 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#11 +# asm 2: vpor x4=%xmm10 +vpor %xmm13,%xmm10,%xmm10 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#3 +# asm 2: vpand v00=%xmm2 +vpand %xmm2,%xmm7,%xmm2 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#9 +# asm 2: vpslld $16,v10=%xmm8 +vpslld $16,%xmm1,%xmm8 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm3,%xmm1,%xmm1 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#3 +# asm 2: vpor x5=%xmm2 +vpor %xmm8,%xmm2,%xmm2 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm7,%xmm1 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#4 +# asm 2: vpand v00=%xmm3 +vpand %xmm4,%xmm9,%xmm3 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#8 +# asm 2: vpsllw $8,v10=%xmm7 +vpsllw $8,%xmm12,%xmm7 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#9 +# asm 2: vpsrlw $8,v01=%xmm8 +vpsrlw $8,%xmm9,%xmm8 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm5,%xmm12,%xmm9 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#4 +# asm 2: vpor x0=%xmm3 +vpor %xmm7,%xmm3,%xmm3 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor 
[ qhasm-generated 128-bit vector (XMM) assembly added by this patch for the
  PQCLEAN_MCELIECE6688128_AVX in-place bit-matrix transpose. This portion of
  the hunk finishes the preceding byte-level swap (8-bit vpsllw/vpsrlw shifts
  against mask4/mask5) and writes the results back starting at
  mem128[ input_0 + 112 ], reloads the six interleave masks
  PQCLEAN_MCELIECE6688128_AVX_MASK2_0/_1, MASK1_0/_1 and MASK0_0/_1 into
  %xmm0-%xmm5, and then processes input_0 (%rdi) in place, 128 bytes at a
  time, at offsets 0, 128, 256, ..., 896: each block loads eight 128-bit
  words with movdqu, applies masked delta swaps at distances 4, 2 and 1
  (vpand / psllq / psrlq / vpor), and stores the transposed words back with
  movdqu, the last loads reaching offset 1008. ]
v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm12,%xmm9,%xmm9 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm13,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm10,%xmm12 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#14 +# asm 2: vpand v10=%xmm13 +vpand %xmm2,%xmm0,%xmm13 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm3,%xmm0,%xmm0 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#13 +# asm 2: vpor x1=%xmm12 +vpor %xmm13,%xmm12,%xmm12 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm10,%xmm0 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm2,%xmm6,%xmm10 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#14 +# asm 2: vpand v10=%xmm13 +vpand %xmm2,%xmm8,%xmm13 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#11 +# asm 2: vpor x4=%xmm10 +vpor %xmm13,%xmm10,%xmm10 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#3 +# asm 2: vpand v10=%xmm2 +vpand %xmm2,%xmm1,%xmm2 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm3,%xmm1,%xmm1 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#3 +# asm 2: vpor x5=%xmm2 +vpor %xmm2,%xmm8,%xmm2 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm7,%xmm1 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#4 +# asm 2: vpand v00=%xmm3 +vpand %xmm4,%xmm9,%xmm3 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#8 +# asm 2: vpand v10=%xmm7 +vpand %xmm4,%xmm12,%xmm7 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm5,%xmm9,%xmm8 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm5,%xmm12,%xmm9 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#4 +# asm 2: vpor x0=%xmm3 +vpor %xmm7,%xmm3,%xmm3 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#8 +# asm 2: vpor x1=%xmm7 +vpor %xmm9,%xmm8,%xmm7 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm4,%xmm11,%xmm8 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#10 +# asm 2: vpand v10=%xmm9 +vpand %xmm4,%xmm0,%xmm9 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm5,%xmm0,%xmm0 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#9 +# asm 2: vpor x2=%xmm8 +vpor %xmm9,%xmm8,%xmm8 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm11,%xmm0 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 
2: vpand v00=%xmm9 +vpand %xmm4,%xmm10,%xmm9 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#12 +# asm 2: vpand v10=%xmm11 +vpand %xmm4,%xmm2,%xmm11 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#3 +# asm 2: vpand v11=%xmm2 +vpand %xmm5,%xmm2,%xmm2 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#10 +# asm 2: vpor x4=%xmm9 +vpor %xmm11,%xmm9,%xmm9 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#3 +# asm 2: vpor x5=%xmm2 +vpor %xmm2,%xmm10,%xmm2 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm4,%xmm6,%xmm10 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#5 +# asm 2: vpand v10=%xmm4 +vpand %xmm4,%xmm1,%xmm4 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm5,%xmm1,%xmm1 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#5 +# asm 2: vpor x6=%xmm4 +vpor %xmm4,%xmm10,%xmm4 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm6,%xmm1 + +# qhasm: mem128[ input_0 + 896 ] = x0 +# asm 1: movdqu mask0=reg256#1 +# asm 2: vmovapd PQCLEAN_MCELIECE6688128_AVX_MASK5_0,>mask0=%ymm0 +vmovapd PQCLEAN_MCELIECE6688128_AVX_MASK5_0(%rip),%ymm0 + +# qhasm: mask1 aligned= mem256[ PQCLEAN_MCELIECE6688128_AVX_MASK5_1 ] +# asm 1: vmovapd PQCLEAN_MCELIECE6688128_AVX_MASK5_1,>mask1=reg256#2 +# asm 2: vmovapd PQCLEAN_MCELIECE6688128_AVX_MASK5_1,>mask1=%ymm1 +vmovapd PQCLEAN_MCELIECE6688128_AVX_MASK5_1(%rip),%ymm1 + +# qhasm: mask2 aligned= mem256[ PQCLEAN_MCELIECE6688128_AVX_MASK4_0 ] +# asm 1: vmovapd PQCLEAN_MCELIECE6688128_AVX_MASK4_0,>mask2=reg256#3 +# asm 2: vmovapd PQCLEAN_MCELIECE6688128_AVX_MASK4_0,>mask2=%ymm2 +vmovapd PQCLEAN_MCELIECE6688128_AVX_MASK4_0(%rip),%ymm2 + +# qhasm: mask3 aligned= mem256[ PQCLEAN_MCELIECE6688128_AVX_MASK4_1 ] +# asm 1: vmovapd PQCLEAN_MCELIECE6688128_AVX_MASK4_1,>mask3=reg256#4 +# asm 2: vmovapd PQCLEAN_MCELIECE6688128_AVX_MASK4_1,>mask3=%ymm3 +vmovapd PQCLEAN_MCELIECE6688128_AVX_MASK4_1(%rip),%ymm3 + +# qhasm: mask4 aligned= mem256[ PQCLEAN_MCELIECE6688128_AVX_MASK3_0 ] +# asm 1: vmovapd PQCLEAN_MCELIECE6688128_AVX_MASK3_0,>mask4=reg256#5 +# asm 2: vmovapd PQCLEAN_MCELIECE6688128_AVX_MASK3_0,>mask4=%ymm4 +vmovapd PQCLEAN_MCELIECE6688128_AVX_MASK3_0(%rip),%ymm4 + +# qhasm: mask5 aligned= mem256[ PQCLEAN_MCELIECE6688128_AVX_MASK3_1 ] +# asm 1: vmovapd PQCLEAN_MCELIECE6688128_AVX_MASK3_1,>mask5=reg256#6 +# asm 2: vmovapd PQCLEAN_MCELIECE6688128_AVX_MASK3_1,>mask5=%ymm5 +vmovapd PQCLEAN_MCELIECE6688128_AVX_MASK3_1(%rip),%ymm5 + +# qhasm: x0 = mem256[ input_0 + 0 ] +# asm 1: vmovupd 0(x0=reg256#7 +# asm 2: vmovupd 0(x0=%ymm6 +vmovupd 0(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 256 ] +# asm 1: vmovupd 256(x1=reg256#8 +# asm 2: vmovupd 256(x1=%ymm7 +vmovupd 256(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 512 ] +# asm 1: vmovupd 512(x2=reg256#9 +# asm 2: vmovupd 512(x2=%ymm8 +vmovupd 512(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 768 ] +# asm 1: vmovupd 768(x3=reg256#10 +# asm 2: vmovupd 768(x3=%ymm9 +vmovupd 768(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1024 ] +# asm 1: vmovupd 1024(x4=reg256#11 +# asm 2: vmovupd 1024(x4=%ymm10 +vmovupd 1024(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1280 ] +# asm 1: vmovupd 1280(x5=reg256#12 +# asm 2: vmovupd 1280(x5=%ymm11 +vmovupd 
1280(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1536 ] +# asm 1: vmovupd 1536(x6=reg256#13 +# asm 2: vmovupd 1536(x6=%ymm12 +vmovupd 1536(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1792 ] +# asm 1: vmovupd 1792(x7=reg256#14 +# asm 2: vmovupd 1792(x7=%ymm13 +vmovupd 1792(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: 
vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: 
vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 0 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 32(x0=%ymm6 +vmovupd 32(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 288 ] +# asm 1: vmovupd 288(x1=reg256#8 +# asm 2: vmovupd 288(x1=%ymm7 +vmovupd 288(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 544 ] +# asm 1: vmovupd 544(x2=reg256#9 +# asm 2: vmovupd 544(x2=%ymm8 +vmovupd 544(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 800 ] +# asm 1: vmovupd 800(x3=reg256#10 +# asm 2: vmovupd 800(x3=%ymm9 +vmovupd 800(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1056 ] +# asm 1: vmovupd 1056(x4=reg256#11 +# asm 2: vmovupd 1056(x4=%ymm10 +vmovupd 1056(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1312 ] +# asm 1: vmovupd 1312(x5=reg256#12 +# asm 2: vmovupd 1312(x5=%ymm11 +vmovupd 1312(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1568 ] +# asm 1: vmovupd 1568(x6=reg256#13 +# asm 2: vmovupd 1568(x6=%ymm12 +vmovupd 1568(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1824 ] +# asm 1: vmovupd 1824(x7=reg256#14 +# asm 2: vmovupd 1824(x7=%ymm13 +vmovupd 1824(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand 
v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor 
x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor 
%ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 32 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 64(x0=%ymm6 +vmovupd 64(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 320 ] +# asm 1: vmovupd 320(x1=reg256#8 +# asm 2: vmovupd 320(x1=%ymm7 +vmovupd 320(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 576 ] +# asm 1: vmovupd 576(x2=reg256#9 +# asm 2: vmovupd 576(x2=%ymm8 +vmovupd 576(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 832 ] +# asm 1: vmovupd 832(x3=reg256#10 +# asm 2: vmovupd 832(x3=%ymm9 +vmovupd 832(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1088 ] +# asm 1: vmovupd 1088(x4=reg256#11 +# asm 2: vmovupd 1088(x4=%ymm10 +vmovupd 1088(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1344 ] +# asm 1: vmovupd 1344(x5=reg256#12 +# asm 2: vmovupd 1344(x5=%ymm11 +vmovupd 1344(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1600 ] +# asm 1: vmovupd 1600(x6=reg256#13 +# asm 2: vmovupd 1600(x6=%ymm12 +vmovupd 1600(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1856 ] +# asm 1: vmovupd 1856(x7=reg256#14 +# asm 2: vmovupd 1856(x7=%ymm13 +vmovupd 1856(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand 
v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 
2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 64 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 96(x0=%ymm6 +vmovupd 96(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 352 ] +# asm 1: vmovupd 352(x1=reg256#8 +# asm 2: vmovupd 352(x1=%ymm7 +vmovupd 352(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 608 ] +# asm 1: vmovupd 608(x2=reg256#9 +# asm 2: vmovupd 608(x2=%ymm8 +vmovupd 608(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 864 ] +# asm 1: vmovupd 864(x3=reg256#10 +# asm 2: vmovupd 864(x3=%ymm9 +vmovupd 864(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1120 ] +# asm 1: vmovupd 1120(x4=reg256#11 +# asm 2: vmovupd 1120(x4=%ymm10 +vmovupd 1120(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1376 ] +# asm 1: vmovupd 1376(x5=reg256#12 +# asm 2: vmovupd 1376(x5=%ymm11 +vmovupd 1376(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1632 ] +# asm 1: vmovupd 1632(x6=reg256#13 +# asm 2: vmovupd 1632(x6=%ymm12 +vmovupd 1632(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1888 ] +# asm 1: vmovupd 1888(x7=reg256#14 +# asm 2: vmovupd 1888(x7=%ymm13 +vmovupd 1888(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq 
$32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: 
vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor 
x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 96 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 128(x0=%ymm6 +vmovupd 128(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 384 ] +# asm 1: vmovupd 384(x1=reg256#8 +# asm 2: vmovupd 384(x1=%ymm7 +vmovupd 384(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 640 ] +# asm 1: vmovupd 640(x2=reg256#9 +# asm 2: vmovupd 640(x2=%ymm8 +vmovupd 640(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 896 ] +# asm 1: vmovupd 896(x3=reg256#10 +# asm 2: vmovupd 896(x3=%ymm9 +vmovupd 896(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1152 ] +# asm 1: vmovupd 1152(x4=reg256#11 +# asm 2: vmovupd 1152(x4=%ymm10 +vmovupd 1152(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1408 ] +# asm 1: vmovupd 1408(x5=reg256#12 +# asm 2: vmovupd 1408(x5=%ymm11 +vmovupd 1408(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1664 ] +# asm 1: vmovupd 1664(x6=reg256#13 +# asm 2: vmovupd 1664(x6=%ymm12 +vmovupd 1664(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1920 ] +# asm 1: vmovupd 1920(x7=reg256#14 +# asm 2: vmovupd 1920(x7=%ymm13 +vmovupd 1920(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq 
$32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand 
[Continuation of the qhasm-generated AVX2 transpose assembly added from PQClean's mceliece6688128 avx implementation. The hunk repeats the same pattern throughout: 256-bit rows are loaded with vmovupd from input_0 (%rdi) at 32-byte and 256-byte strides, combined pairwise with vpand/vpor and the shifts vpsllq/vpsrlq ($32, $4, $2, $1), vpslld/vpsrld ($16) and vpsllw/vpsrlw ($8) against the aligned mask constants PQCLEAN_MCELIECE6688128_AVX_MASK0_0 through PQCLEAN_MCELIECE6688128_AVX_MASK2_1 (loaded with vmovapd), and written back with vmovupd.]
vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 1024 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 1280(x0=%ymm6 +vmovupd 1280(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 1312 ] +# asm 1: vmovupd 1312(x1=reg256#8 +# asm 2: vmovupd 1312(x1=%ymm7 +vmovupd 1312(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 1344 ] +# asm 1: vmovupd 1344(x2=reg256#9 +# asm 2: vmovupd 1344(x2=%ymm8 +vmovupd 
1344(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 1376 ] +# asm 1: vmovupd 1376(x3=reg256#10 +# asm 2: vmovupd 1376(x3=%ymm9 +vmovupd 1376(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1408 ] +# asm 1: vmovupd 1408(x4=reg256#11 +# asm 2: vmovupd 1408(x4=%ymm10 +vmovupd 1408(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1440 ] +# asm 1: vmovupd 1440(x5=reg256#12 +# asm 2: vmovupd 1440(x5=%ymm11 +vmovupd 1440(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1472 ] +# asm 1: vmovupd 1472(x6=reg256#13 +# asm 2: vmovupd 1472(x6=%ymm12 +vmovupd 1472(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1504 ] +# asm 1: vmovupd 1504(x7=reg256#14 +# asm 2: vmovupd 1504(x7=%ymm13 +vmovupd 1504(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# 
asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand 
v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 1280 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 1536(x0=%ymm6 +vmovupd 1536(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 1568 ] +# asm 1: vmovupd 1568(x1=reg256#8 +# asm 2: vmovupd 1568(x1=%ymm7 +vmovupd 1568(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 1600 ] +# asm 1: vmovupd 1600(x2=reg256#9 +# asm 2: vmovupd 1600(x2=%ymm8 +vmovupd 1600(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 1632 ] +# asm 1: vmovupd 1632(x3=reg256#10 +# asm 2: vmovupd 1632(x3=%ymm9 +vmovupd 1632(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1664 ] +# asm 1: vmovupd 1664(x4=reg256#11 +# asm 2: vmovupd 1664(x4=%ymm10 +vmovupd 1664(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1696 ] +# asm 1: vmovupd 1696(x5=reg256#12 +# asm 2: vmovupd 1696(x5=%ymm11 +vmovupd 1696(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1728 ] +# asm 1: vmovupd 1728(x6=reg256#13 +# asm 2: vmovupd 1728(x6=%ymm12 +vmovupd 1728(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1760 ] +# asm 1: vmovupd 1760(x7=reg256#14 +# asm 2: vmovupd 1760(x7=%ymm13 +vmovupd 1760(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# 
asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 
1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 
+# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 1536 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 1792(x0=%ymm6 +vmovupd 1792(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 1824 ] +# asm 1: vmovupd 1824(x1=reg256#8 +# asm 2: vmovupd 1824(x1=%ymm7 +vmovupd 1824(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 1856 ] +# asm 1: vmovupd 1856(x2=reg256#9 +# asm 2: vmovupd 1856(x2=%ymm8 +vmovupd 1856(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 1888 ] +# asm 1: vmovupd 1888(x3=reg256#10 +# asm 2: vmovupd 1888(x3=%ymm9 +vmovupd 1888(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1920 ] +# asm 1: vmovupd 1920(x4=reg256#11 +# asm 2: vmovupd 1920(x4=%ymm10 +vmovupd 1920(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1952 ] +# asm 1: vmovupd 1952(x5=reg256#12 +# asm 2: vmovupd 1952(x5=%ymm11 +vmovupd 1952(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1984 ] +# asm 1: vmovupd 1984(x6=reg256#13 +# asm 2: vmovupd 1984(x6=%ymm12 +vmovupd 1984(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 2016 ] +# asm 1: vmovupd 2016(x7=reg256#14 +# asm 2: vmovupd 2016(x7=%ymm13 +vmovupd 2016(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#1 +# asm 2: vpand v10=%ymm0 +vpand %ymm13,%ymm0,%ymm0 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand 
v11=reg256#2 +# asm 2: vpand v11=%ymm1 +vpand %ymm13,%ymm1,%ymm1 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#1 +# asm 2: vpor x3=%ymm0 +vpor %ymm12,%ymm0,%ymm0 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#2 +# asm 2: vpor x7=%ymm1 +vpor %ymm9,%ymm1,%ymm1 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm14,%ymm2,%ymm9 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#13 +# asm 2: vpand v10=%ymm12 +vpand %ymm11,%ymm2,%ymm12 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm14,%ymm3,%ymm13 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm12,%ymm9 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm13,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm10,%ymm2,%ymm12 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#14 +# asm 2: vpand v10=%ymm13 +vpand %ymm0,%ymm2,%ymm13 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#1 +# asm 2: vpand v11=%ymm0 +vpand %ymm0,%ymm3,%ymm0 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#13 +# asm 2: vpor x1=%ymm12 +vpor %ymm12,%ymm13,%ymm12 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#1 +# asm 2: vpor x3=%ymm0 +vpor %ymm10,%ymm0,%ymm0 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm6,%ymm2,%ymm10 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#14 +# asm 2: vpand v10=%ymm13 +vpand %ymm8,%ymm2,%ymm13 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#11 +# asm 2: vpor x4=%ymm10 +vpor %ymm10,%ymm13,%ymm10 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#3 +# asm 2: vpand v10=%ymm2 +vpand %ymm1,%ymm2,%ymm2 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#2 +# asm 2: vpand v11=%ymm1 +vpand %ymm1,%ymm3,%ymm1 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#3 +# asm 2: vpor x5=%ymm2 +vpor %ymm8,%ymm2,%ymm2 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#2 +# asm 2: vpor x7=%ymm1 +vpor %ymm7,%ymm1,%ymm1 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#4 +# asm 2: vpand v00=%ymm3 +vpand %ymm9,%ymm4,%ymm3 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#8 +# asm 2: vpand v10=%ymm7 +vpand %ymm12,%ymm4,%ymm7 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm9,%ymm5,%ymm8 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm12,%ymm5,%ymm9 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#4 +# asm 2: vpor x0=%ymm3 +vpor %ymm3,%ymm7,%ymm3 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#8 +# asm 2: vpor 
x1=%ymm7 +vpor %ymm8,%ymm9,%ymm7 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm11,%ymm4,%ymm8 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#10 +# asm 2: vpand v10=%ymm9 +vpand %ymm0,%ymm4,%ymm9 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#1 +# asm 2: vpand v11=%ymm0 +vpand %ymm0,%ymm5,%ymm0 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#9 +# asm 2: vpor x2=%ymm8 +vpor %ymm8,%ymm9,%ymm8 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#1 +# asm 2: vpor x3=%ymm0 +vpor %ymm11,%ymm0,%ymm0 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm10,%ymm4,%ymm9 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#12 +# asm 2: vpand v10=%ymm11 +vpand %ymm2,%ymm4,%ymm11 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#3 +# asm 2: vpand v11=%ymm2 +vpand %ymm2,%ymm5,%ymm2 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#10 +# asm 2: vpor x4=%ymm9 +vpor %ymm9,%ymm11,%ymm9 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#3 +# asm 2: vpor x5=%ymm2 +vpor %ymm10,%ymm2,%ymm2 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm6,%ymm4,%ymm10 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#5 +# asm 2: vpand v10=%ymm4 +vpand %ymm1,%ymm4,%ymm4 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#2 +# asm 2: vpand v11=%ymm1 +vpand %ymm1,%ymm5,%ymm1 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#5 +# asm 2: vpor x6=%ymm4 +vpor %ymm10,%ymm4,%ymm4 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#2 +# asm 2: vpor x7=%ymm1 +vpor %ymm6,%ymm1,%ymm1 + +# qhasm: mem256[ input_0 + 1792 ] = x0 +# asm 1: vmovupd +#include + +void PQCLEAN_MCELIECE6688128_AVX_uint32_sort(uint32_t *x, size_t n); + +#endif diff --git a/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/update_asm.S b/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/update_asm.S new file mode 100644 index 0000000000..b51f094092 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/update_asm.S @@ -0,0 +1,576 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: int64 s0 + +# qhasm: int64 s1 + +# qhasm: int64 s2 + +# qhasm: enter update_asm +.p2align 5 +.global _PQCLEAN_MCELIECE6688128_AVX_update_asm +.global PQCLEAN_MCELIECE6688128_AVX_update_asm +_PQCLEAN_MCELIECE6688128_AVX_update_asm: +PQCLEAN_MCELIECE6688128_AVX_update_asm: +mov %rsp,%r11 +and $31,%r11 +add $0,%r11 +sub %r11,%rsp + +# qhasm: s2 = input_1 +# asm 1: mov s2=int64#2 +# asm 2: mov s2=%rsi +mov %rsi,%rsi + +# qhasm: s0 = mem64[ input_0 + 0 ] +# asm 1: movq 0(s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 
+# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1, + +void PQCLEAN_MCELIECE6688128_AVX_store_i(unsigned char *out, uint64_t in, int i) { + int j; + + for (j = 0; j < i; j++) { + out[j] = (in >> (j * 8)) & 0xFF; + } +} + +void PQCLEAN_MCELIECE6688128_AVX_store2(unsigned char *dest, uint16_t a) { + dest[0] = a & 0xFF; + dest[1] = a >> 8; +} + +uint16_t PQCLEAN_MCELIECE6688128_AVX_load2(const unsigned char *src) { + uint16_t a; + + a = src[1]; + a <<= 8; + a |= src[0]; + + return 
a & GFMASK; +} + +uint32_t PQCLEAN_MCELIECE6688128_AVX_load4(const unsigned char *src) { + uint32_t a; + + a = src[3]; + a <<= 8; + a |= src[2]; + a <<= 8; + a |= src[1]; + a <<= 8; + a |= src[0]; + + return a; +} + +void PQCLEAN_MCELIECE6688128_AVX_irr_load(vec128 *out, const unsigned char *in) { + int i, j; + uint64_t v[2]; + uint16_t irr[ SYS_T + 1 ]; + + for (i = 0; i < SYS_T; i++) { + irr[i] = PQCLEAN_MCELIECE6688128_AVX_load2(in + i * 2); + irr[i] &= GFMASK; + } + + irr[ SYS_T ] = 1; + + for (i = 0; i < GFBITS; i++) { + v[0] = v[1] = 0; + + for (j = 63; j >= 0; j--) { + v[0] <<= 1; + v[0] |= (irr[j] >> i) & 1; + } + for (j = SYS_T; j >= 64; j--) { + v[1] <<= 1; + v[1] |= (irr[j] >> i) & 1; + } + + out[i] = PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(v[0], v[1]); + } +} + +void PQCLEAN_MCELIECE6688128_AVX_store8(unsigned char *out, uint64_t in) { + out[0] = (in >> 0x00) & 0xFF; + out[1] = (in >> 0x08) & 0xFF; + out[2] = (in >> 0x10) & 0xFF; + out[3] = (in >> 0x18) & 0xFF; + out[4] = (in >> 0x20) & 0xFF; + out[5] = (in >> 0x28) & 0xFF; + out[6] = (in >> 0x30) & 0xFF; + out[7] = (in >> 0x38) & 0xFF; +} + +uint64_t PQCLEAN_MCELIECE6688128_AVX_load8(const unsigned char *in) { + int i; + uint64_t ret = in[7]; + + for (i = 6; i >= 0; i--) { + ret <<= 8; + ret |= in[i]; + } + + return ret; +} + +vec128 PQCLEAN_MCELIECE6688128_AVX_load16(const unsigned char *in) { + return PQCLEAN_MCELIECE6688128_AVX_vec128_set2x( PQCLEAN_MCELIECE6688128_AVX_load8(in), PQCLEAN_MCELIECE6688128_AVX_load8(in + 8) ); +} + +void PQCLEAN_MCELIECE6688128_AVX_store16(unsigned char *out, vec128 in) { + PQCLEAN_MCELIECE6688128_AVX_store8(out + 0, PQCLEAN_MCELIECE6688128_AVX_vec128_extract(in, 0)); + PQCLEAN_MCELIECE6688128_AVX_store8(out + 8, PQCLEAN_MCELIECE6688128_AVX_vec128_extract(in, 1)); +} diff --git a/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/util.h b/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/util.h new file mode 100644 index 0000000000..e138b7fa70 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/util.h @@ -0,0 +1,23 @@ +#ifndef PQCLEAN_MCELIECE6688128_AVX_UTIL_H +#define PQCLEAN_MCELIECE6688128_AVX_UTIL_H +/* + This file is for loading/storing data in a little-endian fashion +*/ + + +#include "vec128.h" + +#include + +void PQCLEAN_MCELIECE6688128_AVX_store_i(unsigned char *out, uint64_t in, int i); +void PQCLEAN_MCELIECE6688128_AVX_store2(unsigned char *dest, uint16_t a); +uint16_t PQCLEAN_MCELIECE6688128_AVX_load2(const unsigned char *src); +uint32_t PQCLEAN_MCELIECE6688128_AVX_load4(const unsigned char *src); +void PQCLEAN_MCELIECE6688128_AVX_irr_load(vec128 *out, const unsigned char *in); +void PQCLEAN_MCELIECE6688128_AVX_store8(unsigned char *out, uint64_t in); +uint64_t PQCLEAN_MCELIECE6688128_AVX_load8(const unsigned char *in); +vec128 PQCLEAN_MCELIECE6688128_AVX_load16(const unsigned char *in); + +void PQCLEAN_MCELIECE6688128_AVX_store16(unsigned char *out, vec128 in); + +#endif diff --git a/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/vec128.c b/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/vec128.c new file mode 100644 index 0000000000..cece2e8db3 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/vec128.c @@ -0,0 +1,79 @@ +#include "vec128.h" + +vec128 PQCLEAN_MCELIECE6688128_AVX_vec128_set1_16b(uint16_t a) { + return _mm_set1_epi16(a); +} + +vec128 PQCLEAN_MCELIECE6688128_AVX_vec128_setzero(void) { + return _mm_setzero_si128(); +} + +vec128 PQCLEAN_MCELIECE6688128_AVX_vec128_and(vec128 a, vec128 b) { + return 
_mm_and_si128(a, b); +} + +vec128 PQCLEAN_MCELIECE6688128_AVX_vec128_xor(vec128 a, vec128 b) { + return _mm_xor_si128(a, b); +} + +vec128 PQCLEAN_MCELIECE6688128_AVX_vec128_or(vec128 a, vec128 b) { + return _mm_or_si128(a, b); +} + +vec128 PQCLEAN_MCELIECE6688128_AVX_vec128_sll_2x(vec128 a, int s) { + return _mm_slli_epi64(a, s); +} + +vec128 PQCLEAN_MCELIECE6688128_AVX_vec128_srl_2x(vec128 a, int s) { + return _mm_srli_epi64(a, s); +} + +vec128 PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(uint64_t a0, uint64_t a1) { + return _mm_set_epi64x(a1, a0); +} + +vec128 PQCLEAN_MCELIECE6688128_AVX_vec128_unpack_low(vec128 a, vec128 b) { + return _mm_unpacklo_epi64(a, b); +} + +vec128 PQCLEAN_MCELIECE6688128_AVX_vec128_unpack_high(vec128 a, vec128 b) { + return _mm_unpackhi_epi64(a, b); +} + +vec128 PQCLEAN_MCELIECE6688128_AVX_vec128_setbits(uint64_t a) { + return _mm_set1_epi64x(-a); +} + +void PQCLEAN_MCELIECE6688128_AVX_vec128_copy(vec128 *dest, const vec128 *src) { + int i; + + for (i = 0; i < GFBITS; i++) { + dest[i] = src[i]; + } +} + +void PQCLEAN_MCELIECE6688128_AVX_vec128_add(vec128 *c, const vec128 *a, const vec128 *b) { + int i; + + for (i = 0; i < GFBITS; i++) { + c[i] = PQCLEAN_MCELIECE6688128_AVX_vec128_xor(a[i], b[i]); + } +} + +vec128 PQCLEAN_MCELIECE6688128_AVX_vec128_or_reduce(const vec128 *a) { + int i; + vec128 ret; + + ret = a[0]; + for (i = 1; i < GFBITS; i++) { + ret = PQCLEAN_MCELIECE6688128_AVX_vec128_or(ret, a[i]); + } + + return ret; +} + +/* bitsliced field multiplications */ +void PQCLEAN_MCELIECE6688128_AVX_vec128_mul(vec128 *h, vec128 *f, const vec128 *g) { + PQCLEAN_MCELIECE6688128_AVX_vec128_mul_asm(h, f, g, 16); +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/vec128.h b/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/vec128.h new file mode 100644 index 0000000000..56d53de5ed --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/vec128.h @@ -0,0 +1,41 @@ +#ifndef PQCLEAN_MCELIECE6688128_AVX_VEC128_H +#define PQCLEAN_MCELIECE6688128_AVX_VEC128_H +/* + This file is for functions related to 128-bit vectors + including functions for bitsliced field operations +*/ + + +#include "params.h" + +#include +#include + +typedef __m128i vec128; + +// this needs to be a macro, because +// _mm_extract_epi64 requires a literal int argument. 
+#define PQCLEAN_MCELIECE6688128_AVX_vec128_extract(a, i) ((uint64_t) _mm_extract_epi64((vec128) (a), (i))) + +int PQCLEAN_MCELIECE6688128_AVX_vec128_testz(vec128 a); +vec128 PQCLEAN_MCELIECE6688128_AVX_vec128_set1_16b(uint16_t a); +vec128 PQCLEAN_MCELIECE6688128_AVX_vec128_setzero(void); +vec128 PQCLEAN_MCELIECE6688128_AVX_vec128_and(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE6688128_AVX_vec128_xor(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE6688128_AVX_vec128_or(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE6688128_AVX_vec128_sll_2x(vec128 a, int s); +vec128 PQCLEAN_MCELIECE6688128_AVX_vec128_srl_2x(vec128 a, int s); +vec128 PQCLEAN_MCELIECE6688128_AVX_vec128_set2x(uint64_t a0, uint64_t a1); +vec128 PQCLEAN_MCELIECE6688128_AVX_vec128_unpack_low(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE6688128_AVX_vec128_unpack_high(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE6688128_AVX_vec128_setbits(uint64_t a); +void PQCLEAN_MCELIECE6688128_AVX_vec128_copy(vec128 *dest, const vec128 *src); +void PQCLEAN_MCELIECE6688128_AVX_vec128_add(vec128 *c, const vec128 *a, const vec128 *b); +vec128 PQCLEAN_MCELIECE6688128_AVX_vec128_or_reduce(const vec128 *a); + +extern void PQCLEAN_MCELIECE6688128_AVX_vec128_mul_asm(vec128 *, vec128 *, const vec128 *, int); + +/* bitsliced field multiplications */ +void PQCLEAN_MCELIECE6688128_AVX_vec128_mul(vec128 *h, vec128 *f, const vec128 *g); + +#endif diff --git a/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/vec128_mul_asm.S b/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/vec128_mul_asm.S new file mode 100644 index 0000000000..fd1176c09f --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/vec128_mul_asm.S @@ -0,0 +1,1816 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: reg256 b0 + +# qhasm: reg256 b1 + +# qhasm: reg256 b2 + +# qhasm: reg256 b3 + +# qhasm: reg256 b4 + +# qhasm: reg256 b5 + +# qhasm: reg256 b6 + +# qhasm: reg256 b7 + +# qhasm: reg256 b8 + +# qhasm: reg256 b9 + +# qhasm: reg256 b10 + +# qhasm: reg256 b11 + +# qhasm: reg256 b12 + +# qhasm: reg256 a0 + +# qhasm: reg256 a1 + +# qhasm: reg256 a2 + +# qhasm: reg256 a3 + +# qhasm: reg256 a4 + +# qhasm: reg256 a5 + +# qhasm: reg256 a6 + +# qhasm: reg256 r0 + +# qhasm: reg256 r1 + +# qhasm: reg256 r2 + +# qhasm: reg256 r3 + +# qhasm: reg256 r4 + +# qhasm: reg256 r5 + +# qhasm: reg256 r6 + +# qhasm: reg256 r7 + +# qhasm: reg256 r8 + +# qhasm: reg256 r9 + +# qhasm: reg256 r10 + +# qhasm: reg256 r11 + +# qhasm: reg256 r12 + +# qhasm: reg256 r13 + +# qhasm: reg256 r14 + +# qhasm: reg256 r15 + +# qhasm: reg256 r16 + +# qhasm: reg256 r17 + +# qhasm: reg256 r18 + +# qhasm: reg256 r19 + +# qhasm: reg256 r20 + +# qhasm: reg256 r21 + +# qhasm: reg256 r22 + +# qhasm: reg256 r23 + +# qhasm: reg256 r24 + +# qhasm: reg256 r + +# qhasm: reg128 h0 + +# qhasm: reg128 h1 + +# qhasm: reg128 h2 + +# qhasm: reg128 h3 + +# qhasm: reg128 h4 + +# qhasm: reg128 h5 + +# qhasm: reg128 h6 + +# qhasm: reg128 h7 + +# qhasm: reg128 h8 + +# qhasm: reg128 h9 + +# qhasm: reg128 h10 + +# qhasm: reg128 h11 + +# qhasm: reg128 h12 + +# qhasm: reg128 h13 + +# qhasm: reg128 h14 + +# qhasm: reg128 h15 + +# qhasm: reg128 h16 + +# qhasm: 
reg128 h17 + +# qhasm: reg128 h18 + +# qhasm: reg128 h19 + +# qhasm: reg128 h20 + +# qhasm: reg128 h21 + +# qhasm: reg128 h22 + +# qhasm: reg128 h23 + +# qhasm: reg128 h24 + +# qhasm: stack4864 buf + +# qhasm: int64 ptr + +# qhasm: int64 tmp + +# qhasm: enter vec128_mul_asm +.p2align 5 +.global _PQCLEAN_MCELIECE6688128_AVX_vec128_mul_asm +.global PQCLEAN_MCELIECE6688128_AVX_vec128_mul_asm +_PQCLEAN_MCELIECE6688128_AVX_vec128_mul_asm: +PQCLEAN_MCELIECE6688128_AVX_vec128_mul_asm: +mov %rsp,%r11 +and $31,%r11 +add $608,%r11 +sub %r11,%rsp + +# qhasm: ptr = &buf +# asm 1: leaq ptr=int64#5 +# asm 2: leaq ptr=%r8 +leaq 0(%rsp),%r8 + +# qhasm: tmp = input_3 +# asm 1: mov tmp=int64#6 +# asm 2: mov tmp=%r9 +mov %rcx,%r9 + +# qhasm: tmp *= 12 +# asm 1: imulq $12,tmp=int64#6 +# asm 2: imulq $12,tmp=%r9 +imulq $12,%r9,%r9 + +# qhasm: input_2 += tmp +# asm 1: add b12=reg256#1 +# asm 2: vbroadcasti128 0(b12=%ymm0 +vbroadcasti128 0(%rdx), %ymm0 + +# qhasm: input_2 -= input_3 +# asm 1: sub a6=reg256#2 +# asm 2: vpxor a6=%ymm1 +vpxor %ymm1,%ymm1,%ymm1 + +# qhasm: a6[0] = mem128[ input_1 + 96 ] +# asm 1: vinsertf128 $0x0,96(r18=reg256#3 +# asm 2: vpand r18=%ymm2 +vpand %ymm0,%ymm1,%ymm2 + +# qhasm: mem256[ ptr + 576 ] = r18 +# asm 1: vmovupd r17=reg256#4 +# asm 2: vpand r17=%ymm3 +vpand %ymm0,%ymm2,%ymm3 + +# qhasm: a4[0] = mem128[ input_1 + 64 ] +# asm 1: vinsertf128 $0x0,64(r16=reg256#6 +# asm 2: vpand r16=%ymm5 +vpand %ymm0,%ymm4,%ymm5 + +# qhasm: a3[0] = mem128[ input_1 + 48 ] +# asm 1: vinsertf128 $0x0,48(r15=reg256#8 +# asm 2: vpand r15=%ymm7 +vpand %ymm0,%ymm6,%ymm7 + +# qhasm: a2[0] = mem128[ input_1 + 32 ] +# asm 1: vinsertf128 $0x0,32(r14=reg256#10 +# asm 2: vpand r14=%ymm9 +vpand %ymm0,%ymm8,%ymm9 + +# qhasm: a1[0] = mem128[ input_1 + 16 ] +# asm 1: vinsertf128 $0x0,16(r13=reg256#12 +# asm 2: vpand r13=%ymm11 +vpand %ymm0,%ymm10,%ymm11 + +# qhasm: a0[0] = mem128[ input_1 + 0 ] +# asm 1: vinsertf128 $0x0,0(r12=reg256#1 +# asm 2: vpand r12=%ymm0 +vpand %ymm0,%ymm12,%ymm0 + +# qhasm: b11 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b11=reg256#14 +# asm 2: vbroadcasti128 0(b11=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm2,%ymm3 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm4,%ymm3 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm6,%ymm3 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm8,%ymm3 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm10,%ymm3 + +# qhasm: r12 ^= r +# asm 1: vpxor r11=reg256#4 +# asm 2: vpand r11=%ymm3 +vpand %ymm13,%ymm12,%ymm3 + +# qhasm: b10 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b10=reg256#14 +# asm 2: vbroadcasti128 0(b10=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm2,%ymm5 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm4,%ymm5 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm6,%ymm5 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm8,%ymm5 + +# qhasm: r12 ^= r +# asm 1: vpxor 
r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm10,%ymm5 + +# qhasm: r11 ^= r +# asm 1: vpxor r10=reg256#6 +# asm 2: vpand r10=%ymm5 +vpand %ymm13,%ymm12,%ymm5 + +# qhasm: b9 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b9=reg256#14 +# asm 2: vbroadcasti128 0(b9=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm2,%ymm7 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm4,%ymm7 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm6,%ymm7 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm8,%ymm7 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm10,%ymm7 + +# qhasm: r10 ^= r +# asm 1: vpxor r9=reg256#8 +# asm 2: vpand r9=%ymm7 +vpand %ymm13,%ymm12,%ymm7 + +# qhasm: b8 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b8=reg256#14 +# asm 2: vbroadcasti128 0(b8=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm2,%ymm9 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm6,%ymm9 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm8,%ymm9 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm10,%ymm9 + +# qhasm: r9 ^= r +# asm 1: vpxor r8=reg256#10 +# asm 2: vpand r8=%ymm9 +vpand %ymm13,%ymm12,%ymm9 + +# qhasm: b7 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b7=reg256#14 +# asm 2: vbroadcasti128 0(b7=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm2,%ymm11 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm4,%ymm11 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm6,%ymm11 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm8,%ymm11 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm10,%ymm11 + +# qhasm: r8 ^= r +# asm 1: vpxor r7=reg256#12 +# asm 2: vpand r7=%ymm11 +vpand %ymm13,%ymm12,%ymm11 + +# qhasm: b6 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b6=reg256#14 +# asm 2: vbroadcasti128 0(b6=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm2,%ymm0 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm4,%ymm0 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm6,%ymm0 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm8,%ymm0 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm10,%ymm0 + +# qhasm: r7 ^= r +# asm 1: vpxor r6=reg256#1 +# asm 2: vpand 
r6=%ymm0 +vpand %ymm13,%ymm12,%ymm0 + +# qhasm: b5 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b5=reg256#14 +# asm 2: vbroadcasti128 0(b5=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm2,%ymm3 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm4,%ymm3 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm6,%ymm3 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm8,%ymm3 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm10,%ymm3 + +# qhasm: r6 ^= r +# asm 1: vpxor r5=reg256#4 +# asm 2: vpand r5=%ymm3 +vpand %ymm13,%ymm12,%ymm3 + +# qhasm: b4 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b4=reg256#14 +# asm 2: vbroadcasti128 0(b4=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm2,%ymm5 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm4,%ymm5 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm6,%ymm5 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm8,%ymm5 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm10,%ymm5 + +# qhasm: r5 ^= r +# asm 1: vpxor r4=reg256#6 +# asm 2: vpand r4=%ymm5 +vpand %ymm13,%ymm12,%ymm5 + +# qhasm: b3 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b3=reg256#14 +# asm 2: vbroadcasti128 0(b3=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm2,%ymm7 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm4,%ymm7 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm6,%ymm7 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm8,%ymm7 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm10,%ymm7 + +# qhasm: r4 ^= r +# asm 1: vpxor r3=reg256#8 +# asm 2: vpand r3=%ymm7 +vpand %ymm13,%ymm12,%ymm7 + +# qhasm: b2 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b2=reg256#14 +# asm 2: vbroadcasti128 0(b2=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm2,%ymm9 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm6,%ymm9 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm8,%ymm9 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm10,%ymm9 + +# qhasm: r3 ^= r +# asm 1: vpxor r2=reg256#10 +# asm 2: vpand r2=%ymm9 +vpand %ymm13,%ymm12,%ymm9 + +# qhasm: b1 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b1=reg256#14 +# asm 2: vbroadcasti128 0(b1=%ymm13 
+vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm2,%ymm11 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm4,%ymm11 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm6,%ymm11 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm8,%ymm11 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm10,%ymm11 + +# qhasm: r2 ^= r +# asm 1: vpxor r1=reg256#12 +# asm 2: vpand r1=%ymm11 +vpand %ymm13,%ymm12,%ymm11 + +# qhasm: b0 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b0=reg256#14 +# asm 2: vbroadcasti128 0(b0=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#2 +# asm 2: vpand r=%ymm1 +vpand %ymm13,%ymm1,%ymm1 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm2,%ymm0 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm4,%ymm0 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm6,%ymm0 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm8,%ymm0 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm10,%ymm0 + +# qhasm: r1 ^= r +# asm 1: vpxor r0=reg256#1 +# asm 2: vpand r0=%ymm0 +vpand %ymm13,%ymm12,%ymm0 + +# qhasm: mem256[ ptr + 160 ] = r5 +# asm 1: vmovupd h24=reg128#1 +# asm 2: movdqu 560(h24=%xmm0 +movdqu 560(%r8),%xmm0 + +# qhasm: h11 = h24 +# asm 1: movdqa h11=reg128#2 +# asm 2: movdqa h11=%xmm1 +movdqa %xmm0,%xmm1 + +# qhasm: h12 = h24 +# asm 1: movdqa h12=reg128#3 +# asm 2: movdqa h12=%xmm2 +movdqa %xmm0,%xmm2 + +# qhasm: h14 = h24 +# asm 1: movdqa h14=reg128#4 +# asm 2: movdqa h14=%xmm3 +movdqa %xmm0,%xmm3 + +# qhasm: h15 = h24 +# asm 1: movdqa h15=reg128#1 +# asm 2: movdqa h15=%xmm0 +movdqa %xmm0,%xmm0 + +# qhasm: h23 = mem128[ ptr + 528 ] +# asm 1: movdqu 528(h23=reg128#5 +# asm 2: movdqu 528(h23=%xmm4 +movdqu 528(%r8),%xmm4 + +# qhasm: h10 = h23 +# asm 1: movdqa h10=reg128#6 +# asm 2: movdqa h10=%xmm5 +movdqa %xmm4,%xmm5 + +# qhasm: h11 = h11 ^ h23 +# asm 1: vpxor h11=reg128#2 +# asm 2: vpxor h11=%xmm1 +vpxor %xmm4,%xmm1,%xmm1 + +# qhasm: h13 = h23 +# asm 1: movdqa h13=reg128#7 +# asm 2: movdqa h13=%xmm6 +movdqa %xmm4,%xmm6 + +# qhasm: h14 = h14 ^ h23 +# asm 1: vpxor h14=reg128#4 +# asm 2: vpxor h14=%xmm3 +vpxor %xmm4,%xmm3,%xmm3 + +# qhasm: h22 = mem128[ ptr + 496 ] +# asm 1: movdqu 496(h22=reg128#5 +# asm 2: movdqu 496(h22=%xmm4 +movdqu 496(%r8),%xmm4 + +# qhasm: h9 = h22 +# asm 1: movdqa h9=reg128#8 +# asm 2: movdqa h9=%xmm7 +movdqa %xmm4,%xmm7 + +# qhasm: h10 = h10 ^ h22 +# asm 1: vpxor h10=reg128#6 +# asm 2: vpxor h10=%xmm5 +vpxor %xmm4,%xmm5,%xmm5 + +# qhasm: h12 = h12 ^ h22 +# asm 1: vpxor h12=reg128#3 +# asm 2: vpxor h12=%xmm2 +vpxor %xmm4,%xmm2,%xmm2 + +# qhasm: h13 = h13 ^ h22 +# asm 1: vpxor h13=reg128#5 +# asm 2: vpxor h13=%xmm4 +vpxor %xmm4,%xmm6,%xmm4 + +# qhasm: h21 = mem128[ ptr + 464 ] +# asm 1: movdqu 464(h21=reg128#7 +# asm 2: movdqu 464(h21=%xmm6 +movdqu 464(%r8),%xmm6 + +# qhasm: h8 = h21 +# asm 1: movdqa h8=reg128#9 +# asm 2: movdqa h8=%xmm8 +movdqa %xmm6,%xmm8 + +# qhasm: h9 = h9 ^ h21 +# asm 1: vpxor h9=reg128#8 +# asm 2: vpxor h9=%xmm7 +vpxor %xmm6,%xmm7,%xmm7 + +# qhasm: h11 = h11 ^ h21 +# asm 
1: vpxor h11=reg128#2 +# asm 2: vpxor h11=%xmm1 +vpxor %xmm6,%xmm1,%xmm1 + +# qhasm: h12 = h12 ^ h21 +# asm 1: vpxor h12=reg128#3 +# asm 2: vpxor h12=%xmm2 +vpxor %xmm6,%xmm2,%xmm2 + +# qhasm: h20 = mem128[ ptr + 432 ] +# asm 1: movdqu 432(h20=reg128#7 +# asm 2: movdqu 432(h20=%xmm6 +movdqu 432(%r8),%xmm6 + +# qhasm: h7 = h20 +# asm 1: movdqa h7=reg128#10 +# asm 2: movdqa h7=%xmm9 +movdqa %xmm6,%xmm9 + +# qhasm: h8 = h8 ^ h20 +# asm 1: vpxor h8=reg128#9 +# asm 2: vpxor h8=%xmm8 +vpxor %xmm6,%xmm8,%xmm8 + +# qhasm: h10 = h10 ^ h20 +# asm 1: vpxor h10=reg128#6 +# asm 2: vpxor h10=%xmm5 +vpxor %xmm6,%xmm5,%xmm5 + +# qhasm: h11 = h11 ^ h20 +# asm 1: vpxor h11=reg128#2 +# asm 2: vpxor h11=%xmm1 +vpxor %xmm6,%xmm1,%xmm1 + +# qhasm: h19 = mem128[ ptr + 400 ] +# asm 1: movdqu 400(h19=reg128#7 +# asm 2: movdqu 400(h19=%xmm6 +movdqu 400(%r8),%xmm6 + +# qhasm: h6 = h19 +# asm 1: movdqa h6=reg128#11 +# asm 2: movdqa h6=%xmm10 +movdqa %xmm6,%xmm10 + +# qhasm: h7 = h7 ^ h19 +# asm 1: vpxor h7=reg128#10 +# asm 2: vpxor h7=%xmm9 +vpxor %xmm6,%xmm9,%xmm9 + +# qhasm: h9 = h9 ^ h19 +# asm 1: vpxor h9=reg128#8 +# asm 2: vpxor h9=%xmm7 +vpxor %xmm6,%xmm7,%xmm7 + +# qhasm: h10 = h10 ^ h19 +# asm 1: vpxor h10=reg128#6 +# asm 2: vpxor h10=%xmm5 +vpxor %xmm6,%xmm5,%xmm5 + +# qhasm: h18 = mem128[ ptr + 368 ] +# asm 1: movdqu 368(h18=reg128#7 +# asm 2: movdqu 368(h18=%xmm6 +movdqu 368(%r8),%xmm6 + +# qhasm: h18 = h18 ^ mem128[ ptr + 576 ] +# asm 1: vpxor 576(h18=reg128#7 +# asm 2: vpxor 576(h18=%xmm6 +vpxor 576(%r8),%xmm6,%xmm6 + +# qhasm: h5 = h18 +# asm 1: movdqa h5=reg128#12 +# asm 2: movdqa h5=%xmm11 +movdqa %xmm6,%xmm11 + +# qhasm: h6 = h6 ^ h18 +# asm 1: vpxor h6=reg128#11 +# asm 2: vpxor h6=%xmm10 +vpxor %xmm6,%xmm10,%xmm10 + +# qhasm: h8 = h8 ^ h18 +# asm 1: vpxor h8=reg128#9 +# asm 2: vpxor h8=%xmm8 +vpxor %xmm6,%xmm8,%xmm8 + +# qhasm: h9 = h9 ^ h18 +# asm 1: vpxor h9=reg128#7 +# asm 2: vpxor h9=%xmm6 +vpxor %xmm6,%xmm7,%xmm6 + +# qhasm: h17 = mem128[ ptr + 336 ] +# asm 1: movdqu 336(h17=reg128#8 +# asm 2: movdqu 336(h17=%xmm7 +movdqu 336(%r8),%xmm7 + +# qhasm: h17 = h17 ^ mem128[ ptr + 544 ] +# asm 1: vpxor 544(h17=reg128#8 +# asm 2: vpxor 544(h17=%xmm7 +vpxor 544(%r8),%xmm7,%xmm7 + +# qhasm: h4 = h17 +# asm 1: movdqa h4=reg128#13 +# asm 2: movdqa h4=%xmm12 +movdqa %xmm7,%xmm12 + +# qhasm: h5 = h5 ^ h17 +# asm 1: vpxor h5=reg128#12 +# asm 2: vpxor h5=%xmm11 +vpxor %xmm7,%xmm11,%xmm11 + +# qhasm: h7 = h7 ^ h17 +# asm 1: vpxor h7=reg128#10 +# asm 2: vpxor h7=%xmm9 +vpxor %xmm7,%xmm9,%xmm9 + +# qhasm: h8 = h8 ^ h17 +# asm 1: vpxor h8=reg128#8 +# asm 2: vpxor h8=%xmm7 +vpxor %xmm7,%xmm8,%xmm7 + +# qhasm: h16 = mem128[ ptr + 304 ] +# asm 1: movdqu 304(h16=reg128#9 +# asm 2: movdqu 304(h16=%xmm8 +movdqu 304(%r8),%xmm8 + +# qhasm: h16 = h16 ^ mem128[ ptr + 512 ] +# asm 1: vpxor 512(h16=reg128#9 +# asm 2: vpxor 512(h16=%xmm8 +vpxor 512(%r8),%xmm8,%xmm8 + +# qhasm: h3 = h16 +# asm 1: movdqa h3=reg128#14 +# asm 2: movdqa h3=%xmm13 +movdqa %xmm8,%xmm13 + +# qhasm: h4 = h4 ^ h16 +# asm 1: vpxor h4=reg128#13 +# asm 2: vpxor h4=%xmm12 +vpxor %xmm8,%xmm12,%xmm12 + +# qhasm: h6 = h6 ^ h16 +# asm 1: vpxor h6=reg128#11 +# asm 2: vpxor h6=%xmm10 +vpxor %xmm8,%xmm10,%xmm10 + +# qhasm: h7 = h7 ^ h16 +# asm 1: vpxor h7=reg128#9 +# asm 2: vpxor h7=%xmm8 +vpxor %xmm8,%xmm9,%xmm8 + +# qhasm: h15 = h15 ^ mem128[ ptr + 272 ] +# asm 1: vpxor 272(h15=reg128#1 +# asm 2: vpxor 272(h15=%xmm0 +vpxor 272(%r8),%xmm0,%xmm0 + +# qhasm: h15 = h15 ^ mem128[ ptr + 480 ] +# asm 1: vpxor 480(h15=reg128#1 +# asm 2: vpxor 480(h15=%xmm0 +vpxor 
480(%r8),%xmm0,%xmm0 + +# qhasm: h2 = h15 +# asm 1: movdqa h2=reg128#10 +# asm 2: movdqa h2=%xmm9 +movdqa %xmm0,%xmm9 + +# qhasm: h3 = h3 ^ h15 +# asm 1: vpxor h3=reg128#14 +# asm 2: vpxor h3=%xmm13 +vpxor %xmm0,%xmm13,%xmm13 + +# qhasm: h5 = h5 ^ h15 +# asm 1: vpxor h5=reg128#12 +# asm 2: vpxor h5=%xmm11 +vpxor %xmm0,%xmm11,%xmm11 + +# qhasm: h6 = h6 ^ h15 +# asm 1: vpxor h6=reg128#1 +# asm 2: vpxor h6=%xmm0 +vpxor %xmm0,%xmm10,%xmm0 + +# qhasm: h14 = h14 ^ mem128[ ptr + 240 ] +# asm 1: vpxor 240(h14=reg128#4 +# asm 2: vpxor 240(h14=%xmm3 +vpxor 240(%r8),%xmm3,%xmm3 + +# qhasm: h14 = h14 ^ mem128[ ptr + 448 ] +# asm 1: vpxor 448(h14=reg128#4 +# asm 2: vpxor 448(h14=%xmm3 +vpxor 448(%r8),%xmm3,%xmm3 + +# qhasm: h1 = h14 +# asm 1: movdqa h1=reg128#11 +# asm 2: movdqa h1=%xmm10 +movdqa %xmm3,%xmm10 + +# qhasm: h2 = h2 ^ h14 +# asm 1: vpxor h2=reg128#10 +# asm 2: vpxor h2=%xmm9 +vpxor %xmm3,%xmm9,%xmm9 + +# qhasm: h4 = h4 ^ h14 +# asm 1: vpxor h4=reg128#13 +# asm 2: vpxor h4=%xmm12 +vpxor %xmm3,%xmm12,%xmm12 + +# qhasm: h5 = h5 ^ h14 +# asm 1: vpxor h5=reg128#4 +# asm 2: vpxor h5=%xmm3 +vpxor %xmm3,%xmm11,%xmm3 + +# qhasm: h13 = h13 ^ mem128[ ptr + 208 ] +# asm 1: vpxor 208(h13=reg128#5 +# asm 2: vpxor 208(h13=%xmm4 +vpxor 208(%r8),%xmm4,%xmm4 + +# qhasm: h13 = h13 ^ mem128[ ptr + 416 ] +# asm 1: vpxor 416(h13=reg128#5 +# asm 2: vpxor 416(h13=%xmm4 +vpxor 416(%r8),%xmm4,%xmm4 + +# qhasm: h0 = h13 +# asm 1: movdqa h0=reg128#12 +# asm 2: movdqa h0=%xmm11 +movdqa %xmm4,%xmm11 + +# qhasm: h1 = h1 ^ h13 +# asm 1: vpxor h1=reg128#11 +# asm 2: vpxor h1=%xmm10 +vpxor %xmm4,%xmm10,%xmm10 + +# qhasm: h3 = h3 ^ h13 +# asm 1: vpxor h3=reg128#14 +# asm 2: vpxor h3=%xmm13 +vpxor %xmm4,%xmm13,%xmm13 + +# qhasm: h4 = h4 ^ h13 +# asm 1: vpxor h4=reg128#5 +# asm 2: vpxor h4=%xmm4 +vpxor %xmm4,%xmm12,%xmm4 + +# qhasm: h12 = h12 ^ mem128[ ptr + 384 ] +# asm 1: vpxor 384(h12=reg128#3 +# asm 2: vpxor 384(h12=%xmm2 +vpxor 384(%r8),%xmm2,%xmm2 + +# qhasm: h12 = h12 ^ mem128[ ptr + 176 ] +# asm 1: vpxor 176(h12=reg128#3 +# asm 2: vpxor 176(h12=%xmm2 +vpxor 176(%r8),%xmm2,%xmm2 + +# qhasm: mem128[ input_0 + 192 ] = h12 +# asm 1: movdqu h11=reg128#2 +# asm 2: vpxor 352(h11=%xmm1 +vpxor 352(%r8),%xmm1,%xmm1 + +# qhasm: h11 = h11 ^ mem128[ ptr + 144 ] +# asm 1: vpxor 144(h11=reg128#2 +# asm 2: vpxor 144(h11=%xmm1 +vpxor 144(%r8),%xmm1,%xmm1 + +# qhasm: mem128[ input_0 + 176 ] = h11 +# asm 1: movdqu h10=reg128#2 +# asm 2: vpxor 320(h10=%xmm1 +vpxor 320(%r8),%xmm5,%xmm1 + +# qhasm: h10 = h10 ^ mem128[ ptr + 112 ] +# asm 1: vpxor 112(h10=reg128#2 +# asm 2: vpxor 112(h10=%xmm1 +vpxor 112(%r8),%xmm1,%xmm1 + +# qhasm: mem128[ input_0 + 160 ] = h10 +# asm 1: movdqu h9=reg128#2 +# asm 2: vpxor 288(h9=%xmm1 +vpxor 288(%r8),%xmm6,%xmm1 + +# qhasm: h9 = h9 ^ mem128[ ptr + 80 ] +# asm 1: vpxor 80(h9=reg128#2 +# asm 2: vpxor 80(h9=%xmm1 +vpxor 80(%r8),%xmm1,%xmm1 + +# qhasm: mem128[ input_0 + 144 ] = h9 +# asm 1: movdqu h8=reg128#2 +# asm 2: vpxor 256(h8=%xmm1 +vpxor 256(%r8),%xmm7,%xmm1 + +# qhasm: h8 = h8 ^ mem128[ ptr + 48 ] +# asm 1: vpxor 48(h8=reg128#2 +# asm 2: vpxor 48(h8=%xmm1 +vpxor 48(%r8),%xmm1,%xmm1 + +# qhasm: mem128[ input_0 + 128 ] = h8 +# asm 1: movdqu h7=reg128#2 +# asm 2: vpxor 224(h7=%xmm1 +vpxor 224(%r8),%xmm8,%xmm1 + +# qhasm: h7 = h7 ^ mem128[ ptr + 16 ] +# asm 1: vpxor 16(h7=reg128#2 +# asm 2: vpxor 16(h7=%xmm1 +vpxor 16(%r8),%xmm1,%xmm1 + +# qhasm: mem128[ input_0 + 112 ] = h7 +# asm 1: movdqu h6=reg128#1 +# asm 2: vpxor 192(h6=%xmm0 +vpxor 192(%r8),%xmm0,%xmm0 + +# qhasm: mem128[ input_0 + 96 ] = h6 +# asm 
1: movdqu h5=reg128#1 +# asm 2: vpxor 160(h5=%xmm0 +vpxor 160(%r8),%xmm3,%xmm0 + +# qhasm: mem128[ input_0 + 80 ] = h5 +# asm 1: movdqu h4=reg128#1 +# asm 2: vpxor 128(h4=%xmm0 +vpxor 128(%r8),%xmm4,%xmm0 + +# qhasm: mem128[ input_0 + 64 ] = h4 +# asm 1: movdqu h3=reg128#1 +# asm 2: vpxor 96(h3=%xmm0 +vpxor 96(%r8),%xmm13,%xmm0 + +# qhasm: mem128[ input_0 + 48 ] = h3 +# asm 1: movdqu h2=reg128#1 +# asm 2: vpxor 64(h2=%xmm0 +vpxor 64(%r8),%xmm9,%xmm0 + +# qhasm: mem128[ input_0 + 32 ] = h2 +# asm 1: movdqu h1=reg128#1 +# asm 2: vpxor 32(h1=%xmm0 +vpxor 32(%r8),%xmm10,%xmm0 + +# qhasm: mem128[ input_0 + 16 ] = h1 +# asm 1: movdqu h0=reg128#1 +# asm 2: vpxor 0(h0=%xmm0 +vpxor 0(%r8),%xmm11,%xmm0 + +# qhasm: mem128[ input_0 + 0 ] = h0 +# asm 1: movdqu + +typedef __m256i vec256; + +vec256 PQCLEAN_MCELIECE6688128_AVX_vec256_set1_16b(uint16_t a); +vec256 PQCLEAN_MCELIECE6688128_AVX_vec256_setzero(void); +vec256 PQCLEAN_MCELIECE6688128_AVX_vec256_set4x(uint64_t a0, uint64_t a1, uint64_t a2, uint64_t a3); + +// Extract requires a literal argument so need to be macros +#define PQCLEAN_MCELIECE6688128_AVX_vec256_extract2x(a,i) ((vec128) _mm256_extractf128_si256((vec256) (a),(i))) +#define PQCLEAN_MCELIECE6688128_AVX_vec256_extract(a,i) ((uint64_t) _mm256_extract_epi64((vec256) (a),(i))) + +int PQCLEAN_MCELIECE6688128_AVX_vec256_testz(vec256 a); +vec256 PQCLEAN_MCELIECE6688128_AVX_vec256_and(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE6688128_AVX_vec256_xor(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE6688128_AVX_vec256_or(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE6688128_AVX_vec256_sll_4x(vec256 a, int s); +vec256 PQCLEAN_MCELIECE6688128_AVX_vec256_srl_4x(vec256 a, int s); +vec256 PQCLEAN_MCELIECE6688128_AVX_vec256_unpack_low(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE6688128_AVX_vec256_unpack_high(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE6688128_AVX_vec256_unpack_low_2x(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE6688128_AVX_vec256_unpack_high_2x(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE6688128_AVX_vec256_or_reduce(const vec256 *a); +void PQCLEAN_MCELIECE6688128_AVX_vec256_copy(vec256 *dest, const vec256 *src); + +/* bitsliced field multiplications */ +void PQCLEAN_MCELIECE6688128_AVX_vec256_mul(vec256 *h, vec256 *f, const vec256 *g); +void PQCLEAN_MCELIECE6688128_AVX_vec256_sq(vec256 * /*out*/, const vec256 * /*in*/); +void PQCLEAN_MCELIECE6688128_AVX_vec256_inv(vec256 * /*out*/, const vec256 * /*in*/); + +extern void PQCLEAN_MCELIECE6688128_AVX_vec256_maa_asm(vec256 *, vec256 *, const vec256 *); +extern void PQCLEAN_MCELIECE6688128_AVX_vec256_ama_asm(vec256 *, vec256 *, const vec256 *); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/vec256_ama_asm.S b/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/vec256_ama_asm.S new file mode 100644 index 0000000000..9a93f3da9b --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6688128_avx/vec256_ama_asm.S @@ -0,0 +1,2322 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: reg256 a0 + +# qhasm: reg256 a1 + +# qhasm: reg256 a2 + +# qhasm: reg256 a3 + +# qhasm: reg256 a4 + +# qhasm: reg256 a5 + +# qhasm: reg256 a6 + +# qhasm: reg256 
a7 + +# qhasm: reg256 a8 + +# qhasm: reg256 a9 + +# qhasm: reg256 a10 + +# qhasm: reg256 a11 + +# qhasm: reg256 a12 + +# qhasm: reg256 b0 + +# qhasm: reg256 b1 + +# qhasm: reg256 r0 + +# qhasm: reg256 r1 + +# qhasm: reg256 r2 + +# qhasm: reg256 r3 + +# qhasm: reg256 r4 + +# qhasm: reg256 r5 + +# qhasm: reg256 r6 + +# qhasm: reg256 r7 + +# qhasm: reg256 r8 + +# qhasm: reg256 r9 + +# qhasm: reg256 r10 + +# qhasm: reg256 r11 + +# qhasm: reg256 r12 + +# qhasm: reg256 r13 + +# qhasm: reg256 r14 + +# qhasm: reg256 r15 + +# qhasm: reg256 r16 + +# qhasm: reg256 r17 + +# qhasm: reg256 r18 + +# qhasm: reg256 r19 + +# qhasm: reg256 r20 + +# qhasm: reg256 r21 + +# qhasm: reg256 r22 + +# qhasm: reg256 r23 + +# qhasm: reg256 r24 + +# qhasm: reg256 r + +# qhasm: enter vec256_ama_asm +.p2align 5 +.global _PQCLEAN_MCELIECE6688128_AVX_vec256_ama_asm +.global PQCLEAN_MCELIECE6688128_AVX_vec256_ama_asm +_PQCLEAN_MCELIECE6688128_AVX_vec256_ama_asm: +PQCLEAN_MCELIECE6688128_AVX_vec256_ama_asm: +mov %rsp,%r11 +and $31,%r11 +add $0,%r11 +sub %r11,%rsp + +# qhasm: b0 = mem256[ input_2 + 0 ] +# asm 1: vmovupd 0(b0=reg256#1 +# asm 2: vmovupd 0(b0=%ymm0 +vmovupd 0(%rdx),%ymm0 + +# qhasm: a12 = mem256[ input_0 + 384 ] +# asm 1: vmovupd 384(a12=reg256#2 +# asm 2: vmovupd 384(a12=%ymm1 +vmovupd 384(%rdi),%ymm1 + +# qhasm: a12 = a12 ^ mem256[ input_1 + 384 ] +# asm 1: vpxor 384(a12=reg256#2 +# asm 2: vpxor 384(a12=%ymm1 +vpxor 384(%rsi),%ymm1,%ymm1 + +# qhasm: mem256[ input_0 + 384 ] = a12 +# asm 1: vmovupd r12=reg256#3 +# asm 2: vpand r12=%ymm2 +vpand %ymm1,%ymm0,%ymm2 + +# qhasm: r13 = a12 & mem256[input_2 + 32] +# asm 1: vpand 32(r13=reg256#4 +# asm 2: vpand 32(r13=%ymm3 +vpand 32(%rdx),%ymm1,%ymm3 + +# qhasm: r14 = a12 & mem256[input_2 + 64] +# asm 1: vpand 64(r14=reg256#5 +# asm 2: vpand 64(r14=%ymm4 +vpand 64(%rdx),%ymm1,%ymm4 + +# qhasm: r15 = a12 & mem256[input_2 + 96] +# asm 1: vpand 96(r15=reg256#6 +# asm 2: vpand 96(r15=%ymm5 +vpand 96(%rdx),%ymm1,%ymm5 + +# qhasm: r16 = a12 & mem256[input_2 + 128] +# asm 1: vpand 128(r16=reg256#7 +# asm 2: vpand 128(r16=%ymm6 +vpand 128(%rdx),%ymm1,%ymm6 + +# qhasm: r17 = a12 & mem256[input_2 + 160] +# asm 1: vpand 160(r17=reg256#8 +# asm 2: vpand 160(r17=%ymm7 +vpand 160(%rdx),%ymm1,%ymm7 + +# qhasm: r18 = a12 & mem256[input_2 + 192] +# asm 1: vpand 192(r18=reg256#9 +# asm 2: vpand 192(r18=%ymm8 +vpand 192(%rdx),%ymm1,%ymm8 + +# qhasm: r19 = a12 & mem256[input_2 + 224] +# asm 1: vpand 224(r19=reg256#10 +# asm 2: vpand 224(r19=%ymm9 +vpand 224(%rdx),%ymm1,%ymm9 + +# qhasm: r20 = a12 & mem256[input_2 + 256] +# asm 1: vpand 256(r20=reg256#11 +# asm 2: vpand 256(r20=%ymm10 +vpand 256(%rdx),%ymm1,%ymm10 + +# qhasm: r21 = a12 & mem256[input_2 + 288] +# asm 1: vpand 288(r21=reg256#12 +# asm 2: vpand 288(r21=%ymm11 +vpand 288(%rdx),%ymm1,%ymm11 + +# qhasm: r22 = a12 & mem256[input_2 + 320] +# asm 1: vpand 320(r22=reg256#13 +# asm 2: vpand 320(r22=%ymm12 +vpand 320(%rdx),%ymm1,%ymm12 + +# qhasm: r23 = a12 & mem256[input_2 + 352] +# asm 1: vpand 352(r23=reg256#14 +# asm 2: vpand 352(r23=%ymm13 +vpand 352(%rdx),%ymm1,%ymm13 + +# qhasm: r24 = a12 & mem256[input_2 + 384] +# asm 1: vpand 384(r24=reg256#2 +# asm 2: vpand 384(r24=%ymm1 +vpand 384(%rdx),%ymm1,%ymm1 + +# qhasm: r15 ^= r24 +# asm 1: vpxor r11=reg256#2 +# asm 2: vmovapd r11=%ymm1 +vmovapd %ymm1,%ymm1 + +# qhasm: a11 = mem256[ input_0 + 352 ] +# asm 1: vmovupd 352(a11=reg256#15 +# asm 2: vmovupd 352(a11=%ymm14 +vmovupd 352(%rdi),%ymm14 + +# qhasm: a11 = a11 ^ mem256[ input_1 + 352 ] +# asm 1: vpxor 352(a11=reg256#15 +# asm 2: 
vpxor 352(a11=%ymm14 +vpxor 352(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 352 ] = a11 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r21 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r22 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r23 ^= r +# asm 1: vpxor r10=reg256#14 +# asm 2: vmovapd r10=%ymm13 +vmovapd %ymm13,%ymm13 + +# qhasm: a10 = mem256[ input_0 + 320 ] +# asm 1: vmovupd 320(a10=reg256#15 +# asm 2: vmovupd 320(a10=%ymm14 +vmovupd 320(%rdi),%ymm14 + +# qhasm: a10 = a10 ^ mem256[ input_1 + 320 ] +# asm 1: vpxor 320(a10=reg256#15 +# asm 2: vpxor 320(a10=%ymm14 +vpxor 320(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 320 ] = a10 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r21 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r22 ^= r +# asm 1: vpxor r9=reg256#13 +# asm 2: vmovapd r9=%ymm12 +vmovapd %ymm12,%ymm12 + +# qhasm: a9 = mem256[ input_0 + 288 ] +# asm 1: vmovupd 288(a9=reg256#15 +# asm 2: vmovupd 288(a9=%ymm14 +vmovupd 288(%rdi),%ymm14 + +# qhasm: a9 = a9 ^ mem256[ 
input_1 + 288 ] +# asm 1: vpxor 288(a9=reg256#15 +# asm 2: vpxor 288(a9=%ymm14 +vpxor 288(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 288 ] = a9 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r21 ^= r +# asm 1: vpxor r8=reg256#12 +# asm 2: vmovapd r8=%ymm11 +vmovapd %ymm11,%ymm11 + +# qhasm: a8 = mem256[ input_0 + 256 ] +# asm 1: vmovupd 256(a8=reg256#15 +# asm 2: vmovupd 256(a8=%ymm14 +vmovupd 256(%rdi),%ymm14 + +# qhasm: a8 = a8 ^ mem256[ input_1 + 256 ] +# asm 1: vpxor 256(a8=reg256#15 +# asm 2: vpxor 256(a8=%ymm14 +vpxor 256(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 256 ] = a8 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r20 ^= r +# asm 1: vpxor r7=reg256#11 +# asm 2: vmovapd r7=%ymm10 +vmovapd %ymm10,%ymm10 + +# qhasm: a7 = mem256[ input_0 + 224 ] +# asm 1: vmovupd 224(a7=reg256#15 +# asm 2: vmovupd 224(a7=%ymm14 +vmovupd 
224(%rdi),%ymm14 + +# qhasm: a7 = a7 ^ mem256[ input_1 + 224 ] +# asm 1: vpxor 224(a7=reg256#15 +# asm 2: vpxor 224(a7=%ymm14 +vpxor 224(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 224 ] = a7 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r19 ^= r +# asm 1: vpxor r6=reg256#10 +# asm 2: vmovapd r6=%ymm9 +vmovapd %ymm9,%ymm9 + +# qhasm: a6 = mem256[ input_0 + 192 ] +# asm 1: vmovupd 192(a6=reg256#15 +# asm 2: vmovupd 192(a6=%ymm14 +vmovupd 192(%rdi),%ymm14 + +# qhasm: a6 = a6 ^ mem256[ input_1 + 192 ] +# asm 1: vpxor 192(a6=reg256#15 +# asm 2: vpxor 192(a6=%ymm14 +vpxor 192(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 192 ] = a6 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r18 ^= r +# asm 1: vpxor r5=reg256#9 +# asm 2: vmovapd r5=%ymm8 +vmovapd %ymm8,%ymm8 + +# qhasm: a5 = mem256[ input_0 + 160 ] +# asm 1: vmovupd 160(a5=reg256#15 +# asm 2: 
vmovupd 160(a5=%ymm14 +vmovupd 160(%rdi),%ymm14 + +# qhasm: a5 = a5 ^ mem256[ input_1 + 160 ] +# asm 1: vpxor 160(a5=reg256#15 +# asm 2: vpxor 160(a5=%ymm14 +vpxor 160(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 160 ] = a5 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r17 ^= r +# asm 1: vpxor r4=reg256#8 +# asm 2: vmovapd r4=%ymm7 +vmovapd %ymm7,%ymm7 + +# qhasm: a4 = mem256[ input_0 + 128 ] +# asm 1: vmovupd 128(a4=reg256#15 +# asm 2: vmovupd 128(a4=%ymm14 +vmovupd 128(%rdi),%ymm14 + +# qhasm: a4 = a4 ^ mem256[ input_1 + 128 ] +# asm 1: vpxor 128(a4=reg256#15 +# asm 2: vpxor 128(a4=%ymm14 +vpxor 128(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 128 ] = a4 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r16 ^= r +# asm 1: vpxor r3=reg256#7 +# asm 2: vmovapd r3=%ymm6 +vmovapd %ymm6,%ymm6 + +# qhasm: a3 = mem256[ input_0 + 96 ] +# asm 1: vmovupd 
96(a3=reg256#15 +# asm 2: vmovupd 96(a3=%ymm14 +vmovupd 96(%rdi),%ymm14 + +# qhasm: a3 = a3 ^ mem256[ input_1 + 96 ] +# asm 1: vpxor 96(a3=reg256#15 +# asm 2: vpxor 96(a3=%ymm14 +vpxor 96(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 96 ] = a3 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r15 ^= r +# asm 1: vpxor r2=reg256#6 +# asm 2: vmovapd r2=%ymm5 +vmovapd %ymm5,%ymm5 + +# qhasm: a2 = mem256[ input_0 + 64 ] +# asm 1: vmovupd 64(a2=reg256#15 +# asm 2: vmovupd 64(a2=%ymm14 +vmovupd 64(%rdi),%ymm14 + +# qhasm: a2 = a2 ^ mem256[ input_1 + 64 ] +# asm 1: vpxor 64(a2=reg256#15 +# asm 2: vpxor 64(a2=%ymm14 +vpxor 64(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 64 ] = a2 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r14 ^= r +# asm 1: vpxor r1=reg256#5 +# asm 2: vmovapd r1=%ymm4 +vmovapd %ymm4,%ymm4 + +# qhasm: a1 = mem256[ input_0 + 32 ] +# asm 1: 
vmovupd 32(a1=reg256#15 +# asm 2: vmovupd 32(a1=%ymm14 +vmovupd 32(%rdi),%ymm14 + +# qhasm: a1 = a1 ^ mem256[ input_1 + 32 ] +# asm 1: vpxor 32(a1=reg256#15 +# asm 2: vpxor 32(a1=%ymm14 +vpxor 32(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 32 ] = a1 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r1 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r13 ^= r +# asm 1: vpxor r0=reg256#4 +# asm 2: vmovapd r0=%ymm3 +vmovapd %ymm3,%ymm3 + +# qhasm: a0 = mem256[ input_0 + 0 ] +# asm 1: vmovupd 0(a0=reg256#15 +# asm 2: vmovupd 0(a0=%ymm14 +vmovupd 0(%rdi),%ymm14 + +# qhasm: a0 = a0 ^ mem256[ input_1 + 0 ] +# asm 1: vpxor 0(a0=reg256#15 +# asm 2: vpxor 0(a0=%ymm14 +vpxor 0(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 0 ] = a0 +# asm 1: vmovupd r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm14,%ymm0,%ymm0 + +# qhasm: r0 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 32(r=%ymm0 +vpand 32(%rdx),%ymm14,%ymm0 + +# qhasm: r1 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 64(r=%ymm0 +vpand 64(%rdx),%ymm14,%ymm0 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 96(r=%ymm0 +vpand 96(%rdx),%ymm14,%ymm0 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 128(r=%ymm0 +vpand 128(%rdx),%ymm14,%ymm0 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 160(r=%ymm0 +vpand 160(%rdx),%ymm14,%ymm0 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 192(r=%ymm0 +vpand 192(%rdx),%ymm14,%ymm0 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 224(r=%ymm0 +vpand 224(%rdx),%ymm14,%ymm0 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 256(r=%ymm0 +vpand 256(%rdx),%ymm14,%ymm0 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 288(r=%ymm0 +vpand 288(%rdx),%ymm14,%ymm0 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 320(r=%ymm0 +vpand 320(%rdx),%ymm14,%ymm0 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 352(r=%ymm0 +vpand 352(%rdx),%ymm14,%ymm0 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 384(r=%ymm0 +vpand 384(%rdx),%ymm14,%ymm0 + +# qhasm: r12 ^= r +# asm 1: vpxor r12=reg256#1 +# asm 2: vpxor 384(r12=%ymm0 +vpxor 384(%rsi),%ymm2,%ymm0 + +# qhasm: mem256[ input_1 + 384 ] = r12 +# asm 1: vmovupd r11=reg256#1 +# asm 2: 
vpxor 352(r11=%ymm0 +vpxor 352(%rsi),%ymm1,%ymm0 + +# qhasm: mem256[ input_1 + 352 ] = r11 +# asm 1: vmovupd r10=reg256#1 +# asm 2: vpxor 320(r10=%ymm0 +vpxor 320(%rsi),%ymm13,%ymm0 + +# qhasm: mem256[ input_1 + 320 ] = r10 +# asm 1: vmovupd r9=reg256#1 +# asm 2: vpxor 288(r9=%ymm0 +vpxor 288(%rsi),%ymm12,%ymm0 + +# qhasm: mem256[ input_1 + 288 ] = r9 +# asm 1: vmovupd r8=reg256#1 +# asm 2: vpxor 256(r8=%ymm0 +vpxor 256(%rsi),%ymm11,%ymm0 + +# qhasm: mem256[ input_1 + 256 ] = r8 +# asm 1: vmovupd r7=reg256#1 +# asm 2: vpxor 224(r7=%ymm0 +vpxor 224(%rsi),%ymm10,%ymm0 + +# qhasm: mem256[ input_1 + 224 ] = r7 +# asm 1: vmovupd r6=reg256#1 +# asm 2: vpxor 192(r6=%ymm0 +vpxor 192(%rsi),%ymm9,%ymm0 + +# qhasm: mem256[ input_1 + 192 ] = r6 +# asm 1: vmovupd r5=reg256#1 +# asm 2: vpxor 160(r5=%ymm0 +vpxor 160(%rsi),%ymm8,%ymm0 + +# qhasm: mem256[ input_1 + 160 ] = r5 +# asm 1: vmovupd r4=reg256#1 +# asm 2: vpxor 128(r4=%ymm0 +vpxor 128(%rsi),%ymm7,%ymm0 + +# qhasm: mem256[ input_1 + 128 ] = r4 +# asm 1: vmovupd r3=reg256#1 +# asm 2: vpxor 96(r3=%ymm0 +vpxor 96(%rsi),%ymm6,%ymm0 + +# qhasm: mem256[ input_1 + 96 ] = r3 +# asm 1: vmovupd r2=reg256#1 +# asm 2: vpxor 64(r2=%ymm0 +vpxor 64(%rsi),%ymm5,%ymm0 + +# qhasm: mem256[ input_1 + 64 ] = r2 +# asm 1: vmovupd r1=reg256#1 +# asm 2: vpxor 32(r1=%ymm0 +vpxor 32(%rsi),%ymm4,%ymm0 + +# qhasm: mem256[ input_1 + 32 ] = r1 +# asm 1: vmovupd r0=reg256#1 +# asm 2: vpxor 0(r0=%ymm0 +vpxor 0(%rsi),%ymm3,%ymm0 + +# qhasm: mem256[ input_1 + 0 ] = r0 +# asm 1: vmovupd b0=reg256#1 +# asm 2: vmovupd 0(b0=%ymm0 +vmovupd 0(%rdx),%ymm0 + +# qhasm: a12 = mem256[ input_1 + 384 ] +# asm 1: vmovupd 384(a12=reg256#2 +# asm 2: vmovupd 384(a12=%ymm1 +vmovupd 384(%rsi),%ymm1 + +# qhasm: r12 = a12 & b0 +# asm 1: vpand r12=reg256#3 +# asm 2: vpand r12=%ymm2 +vpand %ymm1,%ymm0,%ymm2 + +# qhasm: r13 = a12 & mem256[input_2 + 32] +# asm 1: vpand 32(r13=reg256#4 +# asm 2: vpand 32(r13=%ymm3 +vpand 32(%rdx),%ymm1,%ymm3 + +# qhasm: r14 = a12 & mem256[input_2 + 64] +# asm 1: vpand 64(r14=reg256#5 +# asm 2: vpand 64(r14=%ymm4 +vpand 64(%rdx),%ymm1,%ymm4 + +# qhasm: r15 = a12 & mem256[input_2 + 96] +# asm 1: vpand 96(r15=reg256#6 +# asm 2: vpand 96(r15=%ymm5 +vpand 96(%rdx),%ymm1,%ymm5 + +# qhasm: r16 = a12 & mem256[input_2 + 128] +# asm 1: vpand 128(r16=reg256#7 +# asm 2: vpand 128(r16=%ymm6 +vpand 128(%rdx),%ymm1,%ymm6 + +# qhasm: r17 = a12 & mem256[input_2 + 160] +# asm 1: vpand 160(r17=reg256#8 +# asm 2: vpand 160(r17=%ymm7 +vpand 160(%rdx),%ymm1,%ymm7 + +# qhasm: r18 = a12 & mem256[input_2 + 192] +# asm 1: vpand 192(r18=reg256#9 +# asm 2: vpand 192(r18=%ymm8 +vpand 192(%rdx),%ymm1,%ymm8 + +# qhasm: r19 = a12 & mem256[input_2 + 224] +# asm 1: vpand 224(r19=reg256#10 +# asm 2: vpand 224(r19=%ymm9 +vpand 224(%rdx),%ymm1,%ymm9 + +# qhasm: r20 = a12 & mem256[input_2 + 256] +# asm 1: vpand 256(r20=reg256#11 +# asm 2: vpand 256(r20=%ymm10 +vpand 256(%rdx),%ymm1,%ymm10 + +# qhasm: r21 = a12 & mem256[input_2 + 288] +# asm 1: vpand 288(r21=reg256#12 +# asm 2: vpand 288(r21=%ymm11 +vpand 288(%rdx),%ymm1,%ymm11 + +# qhasm: r22 = a12 & mem256[input_2 + 320] +# asm 1: vpand 320(r22=reg256#13 +# asm 2: vpand 320(r22=%ymm12 +vpand 320(%rdx),%ymm1,%ymm12 + +# qhasm: r23 = a12 & mem256[input_2 + 352] +# asm 1: vpand 352(r23=reg256#14 +# asm 2: vpand 352(r23=%ymm13 +vpand 352(%rdx),%ymm1,%ymm13 + +# qhasm: r24 = a12 & mem256[input_2 + 384] +# asm 1: vpand 384(r24=reg256#2 +# asm 2: vpand 384(r24=%ymm1 +vpand 384(%rdx),%ymm1,%ymm1 + +# qhasm: r15 ^= r24 +# asm 1: vpxor r11=reg256#2 +# asm 2: vmovapd 
r11=%ymm1 +vmovapd %ymm1,%ymm1 + +# qhasm: a11 = mem256[ input_1 + 352 ] +# asm 1: vmovupd 352(a11=reg256#15 +# asm 2: vmovupd 352(a11=%ymm14 +vmovupd 352(%rsi),%ymm14 + +# qhasm: r = a11 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r21 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r22 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r23 ^= r +# asm 1: vpxor r10=reg256#14 +# asm 2: vmovapd r10=%ymm13 +vmovapd %ymm13,%ymm13 + +# qhasm: a10 = mem256[ input_1 + 320 ] +# asm 1: vmovupd 320(a10=reg256#15 +# asm 2: vmovupd 320(a10=%ymm14 +vmovupd 320(%rsi),%ymm14 + +# qhasm: r = a10 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r21 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r22 ^= r +# asm 1: vpxor r9=reg256#13 +# asm 2: vmovapd r9=%ymm12 +vmovapd %ymm12,%ymm12 + +# qhasm: a9 = mem256[ input_1 + 288 ] +# asm 1: vmovupd 288(a9=reg256#15 +# asm 2: vmovupd 288(a9=%ymm14 +vmovupd 288(%rsi),%ymm14 + +# qhasm: r = a9 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand 
%ymm14,%ymm0,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r21 ^= r +# asm 1: vpxor r8=reg256#12 +# asm 2: vmovapd r8=%ymm11 +vmovapd %ymm11,%ymm11 + +# qhasm: a8 = mem256[ input_1 + 256 ] +# asm 1: vmovupd 256(a8=reg256#15 +# asm 2: vmovupd 256(a8=%ymm14 +vmovupd 256(%rsi),%ymm14 + +# qhasm: r = a8 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r20 ^= r +# asm 1: vpxor r7=reg256#11 +# asm 2: vmovapd r7=%ymm10 +vmovapd %ymm10,%ymm10 + +# qhasm: a7 = mem256[ input_1 + 224 ] +# asm 1: vmovupd 224(a7=reg256#15 +# asm 2: vmovupd 224(a7=%ymm14 +vmovupd 224(%rsi),%ymm14 + +# qhasm: r = a7 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor 
r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r19 ^= r +# asm 1: vpxor r6=reg256#10 +# asm 2: vmovapd r6=%ymm9 +vmovapd %ymm9,%ymm9 + +# qhasm: a6 = mem256[ input_1 + 192 ] +# asm 1: vmovupd 192(a6=reg256#15 +# asm 2: vmovupd 192(a6=%ymm14 +vmovupd 192(%rsi),%ymm14 + +# qhasm: r = a6 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r18 ^= r +# asm 1: vpxor r5=reg256#9 +# asm 2: vmovapd r5=%ymm8 +vmovapd %ymm8,%ymm8 + +# qhasm: a5 = mem256[ input_1 + 160 ] +# asm 1: vmovupd 160(a5=reg256#15 +# asm 2: vmovupd 160(a5=%ymm14 +vmovupd 160(%rsi),%ymm14 + +# qhasm: r = a5 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 
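+
+# Note: the qhasm-generated passes above and below follow one repeating
+# pattern: each 256-bit limb a_i loaded above is ANDed with the thirteen
+# 32-byte limbs of the operand at input_2 and XORed into the running partial
+# products r_i .. r_(i+12); partial products above index 12 are folded back
+# into the lower limbs after each pass. This appears to be a bitsliced
+# multiply-accumulate over GF(2^13), as used by the Classic McEliece field
+# arithmetic; the exact operand roles are as generated by qhasm upstream.
+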
+ +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r17 ^= r +# asm 1: vpxor r4=reg256#8 +# asm 2: vmovapd r4=%ymm7 +vmovapd %ymm7,%ymm7 + +# qhasm: a4 = mem256[ input_1 + 128 ] +# asm 1: vmovupd 128(a4=reg256#15 +# asm 2: vmovupd 128(a4=%ymm14 +vmovupd 128(%rsi),%ymm14 + +# qhasm: r = a4 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r16 ^= r +# asm 1: vpxor r3=reg256#7 +# asm 2: vmovapd r3=%ymm6 +vmovapd %ymm6,%ymm6 + +# qhasm: a3 = mem256[ input_1 + 96 ] +# asm 1: vmovupd 96(a3=reg256#15 +# asm 2: vmovupd 96(a3=%ymm14 +vmovupd 96(%rsi),%ymm14 + +# qhasm: r = a3 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 
+vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r15 ^= r +# asm 1: vpxor r2=reg256#6 +# asm 2: vmovapd r2=%ymm5 +vmovapd %ymm5,%ymm5 + +# qhasm: a2 = mem256[ input_1 + 64 ] +# asm 1: vmovupd 64(a2=reg256#15 +# asm 2: vmovupd 64(a2=%ymm14 +vmovupd 64(%rsi),%ymm14 + +# qhasm: r = a2 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r14 ^= r +# asm 1: vpxor r1=reg256#5 +# asm 2: vmovapd r1=%ymm4 +vmovapd %ymm4,%ymm4 + +# qhasm: a1 = mem256[ input_1 + 32 ] +# asm 1: vmovupd 32(a1=reg256#15 +# asm 2: vmovupd 32(a1=%ymm14 +vmovupd 32(%rsi),%ymm14 + +# qhasm: r = a1 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r1 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 
2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r13 ^= r +# asm 1: vpxor r0=reg256#4 +# asm 2: vmovapd r0=%ymm3 +vmovapd %ymm3,%ymm3 + +# qhasm: a0 = mem256[ input_1 + 0 ] +# asm 1: vmovupd 0(a0=reg256#15 +# asm 2: vmovupd 0(a0=%ymm14 +vmovupd 0(%rsi),%ymm14 + +# qhasm: r = a0 & b0 +# asm 1: vpand r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm14,%ymm0,%ymm0 + +# qhasm: r0 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 32(r=%ymm0 +vpand 32(%rdx),%ymm14,%ymm0 + +# qhasm: r1 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 64(r=%ymm0 +vpand 64(%rdx),%ymm14,%ymm0 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 96(r=%ymm0 +vpand 96(%rdx),%ymm14,%ymm0 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 128(r=%ymm0 +vpand 128(%rdx),%ymm14,%ymm0 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 160(r=%ymm0 +vpand 160(%rdx),%ymm14,%ymm0 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 192(r=%ymm0 +vpand 192(%rdx),%ymm14,%ymm0 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 224(r=%ymm0 +vpand 224(%rdx),%ymm14,%ymm0 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 256(r=%ymm0 +vpand 256(%rdx),%ymm14,%ymm0 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 288(r=%ymm0 +vpand 288(%rdx),%ymm14,%ymm0 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 320(r=%ymm0 +vpand 320(%rdx),%ymm14,%ymm0 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 352(r=%ymm0 +vpand 352(%rdx),%ymm14,%ymm0 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 384(r=%ymm0 +vpand 384(%rdx),%ymm14,%ymm0 + +# qhasm: r12 ^= r +# asm 1: vpxor r12=reg256#1 +# asm 2: vpxor 384(r12=%ymm0 +vpxor 384(%rdi),%ymm2,%ymm0 + +# qhasm: mem256[ input_0 + 384 ] = r12 +# asm 1: vmovupd r12=reg256#1 +# asm 2: vpxor 384(r12=%ymm0 +vpxor 384(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 384 ] = r12 +# asm 1: vmovupd r11=reg256#1 +# asm 2: vpxor 352(r11=%ymm0 +vpxor 352(%rdi),%ymm1,%ymm0 + +# qhasm: mem256[ input_0 + 352 ] = r11 +# asm 1: vmovupd r11=reg256#1 +# asm 2: vpxor 352(r11=%ymm0 +vpxor 352(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 352 ] = r11 +# asm 1: vmovupd r10=reg256#1 +# asm 2: vpxor 320(r10=%ymm0 +vpxor 320(%rdi),%ymm13,%ymm0 + +# qhasm: mem256[ input_0 + 320 ] = r10 +# asm 1: vmovupd r10=reg256#1 +# asm 2: vpxor 320(r10=%ymm0 +vpxor 320(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 320 ] = r10 +# asm 1: vmovupd r9=reg256#1 +# asm 2: vpxor 288(r9=%ymm0 +vpxor 288(%rdi),%ymm12,%ymm0 + +# qhasm: mem256[ input_0 + 288 ] = r9 +# asm 1: vmovupd r9=reg256#1 +# asm 2: vpxor 288(r9=%ymm0 +vpxor 288(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 288 ] = r9 +# asm 1: vmovupd r8=reg256#1 +# asm 2: vpxor 256(r8=%ymm0 +vpxor 256(%rdi),%ymm11,%ymm0 + +# qhasm: mem256[ input_0 + 256 ] = r8 +# asm 1: vmovupd r8=reg256#1 +# asm 2: vpxor 256(r8=%ymm0 +vpxor 256(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 256 ] = r8 +# asm 1: vmovupd r7=reg256#1 +# asm 2: vpxor 224(r7=%ymm0 +vpxor 224(%rdi),%ymm10,%ymm0 + +# qhasm: mem256[ input_0 + 224 ] = r7 +# asm 1: vmovupd r7=reg256#1 +# asm 2: vpxor 224(r7=%ymm0 +vpxor 224(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 224 ] = r7 +# asm 1: vmovupd r6=reg256#1 +# asm 2: vpxor 192(r6=%ymm0 +vpxor 192(%rdi),%ymm9,%ymm0 + +# qhasm: mem256[ input_0 + 192 ] = r6 +# asm 1: vmovupd r6=reg256#1 +# asm 2: vpxor 192(r6=%ymm0 +vpxor 192(%rsi),%ymm0,%ymm0 + +# 
qhasm: mem256[ input_1 + 192 ] = r6 +# asm 1: vmovupd r5=reg256#1 +# asm 2: vpxor 160(r5=%ymm0 +vpxor 160(%rdi),%ymm8,%ymm0 + +# qhasm: mem256[ input_0 + 160 ] = r5 +# asm 1: vmovupd r5=reg256#1 +# asm 2: vpxor 160(r5=%ymm0 +vpxor 160(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 160 ] = r5 +# asm 1: vmovupd r4=reg256#1 +# asm 2: vpxor 128(r4=%ymm0 +vpxor 128(%rdi),%ymm7,%ymm0 + +# qhasm: mem256[ input_0 + 128 ] = r4 +# asm 1: vmovupd r4=reg256#1 +# asm 2: vpxor 128(r4=%ymm0 +vpxor 128(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 128 ] = r4 +# asm 1: vmovupd r3=reg256#1 +# asm 2: vpxor 96(r3=%ymm0 +vpxor 96(%rdi),%ymm6,%ymm0 + +# qhasm: mem256[ input_0 + 96 ] = r3 +# asm 1: vmovupd r3=reg256#1 +# asm 2: vpxor 96(r3=%ymm0 +vpxor 96(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 96 ] = r3 +# asm 1: vmovupd r2=reg256#1 +# asm 2: vpxor 64(r2=%ymm0 +vpxor 64(%rdi),%ymm5,%ymm0 + +# qhasm: mem256[ input_0 + 64 ] = r2 +# asm 1: vmovupd r2=reg256#1 +# asm 2: vpxor 64(r2=%ymm0 +vpxor 64(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 64 ] = r2 +# asm 1: vmovupd r1=reg256#1 +# asm 2: vpxor 32(r1=%ymm0 +vpxor 32(%rdi),%ymm4,%ymm0 + +# qhasm: mem256[ input_0 + 32 ] = r1 +# asm 1: vmovupd r1=reg256#1 +# asm 2: vpxor 32(r1=%ymm0 +vpxor 32(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 32 ] = r1 +# asm 1: vmovupd r0=reg256#1 +# asm 2: vpxor 0(r0=%ymm0 +vpxor 0(%rdi),%ymm3,%ymm0 + +# qhasm: mem256[ input_0 + 0 ] = r0 +# asm 1: vmovupd r0=reg256#1 +# asm 2: vpxor 0(r0=%ymm0 +vpxor 0(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 0 ] = r0 +# asm 1: vmovupd b0=reg256#1 +# asm 2: vmovupd 0(b0=%ymm0 +vmovupd 0(%rdx),%ymm0 + +# qhasm: a12 = mem256[ input_1 + 384 ] +# asm 1: vmovupd 384(a12=reg256#2 +# asm 2: vmovupd 384(a12=%ymm1 +vmovupd 384(%rsi),%ymm1 + +# qhasm: r12 = a12 & b0 +# asm 1: vpand r12=reg256#3 +# asm 2: vpand r12=%ymm2 +vpand %ymm1,%ymm0,%ymm2 + +# qhasm: r13 = a12 & mem256[input_2 + 32] +# asm 1: vpand 32(r13=reg256#4 +# asm 2: vpand 32(r13=%ymm3 +vpand 32(%rdx),%ymm1,%ymm3 + +# qhasm: r14 = a12 & mem256[input_2 + 64] +# asm 1: vpand 64(r14=reg256#5 +# asm 2: vpand 64(r14=%ymm4 +vpand 64(%rdx),%ymm1,%ymm4 + +# qhasm: r15 = a12 & mem256[input_2 + 96] +# asm 1: vpand 96(r15=reg256#6 +# asm 2: vpand 96(r15=%ymm5 +vpand 96(%rdx),%ymm1,%ymm5 + +# qhasm: r16 = a12 & mem256[input_2 + 128] +# asm 1: vpand 128(r16=reg256#7 +# asm 2: vpand 128(r16=%ymm6 +vpand 128(%rdx),%ymm1,%ymm6 + +# qhasm: r17 = a12 & mem256[input_2 + 160] +# asm 1: vpand 160(r17=reg256#8 +# asm 2: vpand 160(r17=%ymm7 +vpand 160(%rdx),%ymm1,%ymm7 + +# qhasm: r18 = a12 & mem256[input_2 + 192] +# asm 1: vpand 192(r18=reg256#9 +# asm 2: vpand 192(r18=%ymm8 +vpand 192(%rdx),%ymm1,%ymm8 + +# qhasm: r19 = a12 & mem256[input_2 + 224] +# asm 1: vpand 224(r19=reg256#10 +# asm 2: vpand 224(r19=%ymm9 +vpand 224(%rdx),%ymm1,%ymm9 + +# qhasm: r20 = a12 & mem256[input_2 + 256] +# asm 1: vpand 256(r20=reg256#11 +# asm 2: vpand 256(r20=%ymm10 +vpand 256(%rdx),%ymm1,%ymm10 + +# qhasm: r21 = a12 & mem256[input_2 + 288] +# asm 1: vpand 288(r21=reg256#12 +# asm 2: vpand 288(r21=%ymm11 +vpand 288(%rdx),%ymm1,%ymm11 + +# qhasm: r22 = a12 & mem256[input_2 + 320] +# asm 1: vpand 320(r22=reg256#13 +# asm 2: vpand 320(r22=%ymm12 +vpand 320(%rdx),%ymm1,%ymm12 + +# qhasm: r23 = a12 & mem256[input_2 + 352] +# asm 1: vpand 352(r23=reg256#14 +# asm 2: vpand 352(r23=%ymm13 +vpand 352(%rdx),%ymm1,%ymm13 + +# qhasm: r24 = a12 & mem256[input_2 + 384] +# asm 1: vpand 384(r24=reg256#2 +# asm 2: vpand 384(r24=%ymm1 +vpand 384(%rdx),%ymm1,%ymm1 + +# qhasm: r15 ^= 
r24 +# asm 1: vpxor r11=reg256#2 +# asm 2: vmovapd r11=%ymm1 +vmovapd %ymm1,%ymm1 + +# qhasm: a11 = mem256[ input_1 + 352 ] +# asm 1: vmovupd 352(a11=reg256#15 +# asm 2: vmovupd 352(a11=%ymm14 +vmovupd 352(%rsi),%ymm14 + +# qhasm: r = a11 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r21 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r22 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r23 ^= r +# asm 1: vpxor r10=reg256#14 +# asm 2: vmovapd r10=%ymm13 +vmovapd %ymm13,%ymm13 + +# qhasm: a10 = mem256[ input_1 + 320 ] +# asm 1: vmovupd 320(a10=reg256#15 +# asm 2: vmovupd 320(a10=%ymm14 +vmovupd 320(%rsi),%ymm14 + +# qhasm: r = a10 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r21 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r22 ^= r +# asm 1: vpxor r9=reg256#13 +# asm 2: vmovapd r9=%ymm12 +vmovapd %ymm12,%ymm12 + +# qhasm: a9 = mem256[ input_1 + 288 ] +# asm 1: vmovupd 288(a9=reg256#15 +# asm 2: vmovupd 288(a9=%ymm14 +vmovupd 288(%rsi),%ymm14 + +# qhasm: r = a9 & b0 +# asm 1: vpand 
r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r21 ^= r +# asm 1: vpxor r8=reg256#12 +# asm 2: vmovapd r8=%ymm11 +vmovapd %ymm11,%ymm11 + +# qhasm: a8 = mem256[ input_1 + 256 ] +# asm 1: vmovupd 256(a8=reg256#15 +# asm 2: vmovupd 256(a8=%ymm14 +vmovupd 256(%rsi),%ymm14 + +# qhasm: r = a8 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r20 ^= r +# asm 1: vpxor r7=reg256#11 +# asm 2: vmovapd r7=%ymm10 +vmovapd %ymm10,%ymm10 + +# qhasm: a7 = mem256[ input_1 + 224 ] +# asm 1: vmovupd 224(a7=reg256#15 +# asm 2: vmovupd 224(a7=%ymm14 +vmovupd 224(%rsi),%ymm14 + +# qhasm: r = a7 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 
+ +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r19 ^= r +# asm 1: vpxor r6=reg256#10 +# asm 2: vmovapd r6=%ymm9 +vmovapd %ymm9,%ymm9 + +# qhasm: a6 = mem256[ input_1 + 192 ] +# asm 1: vmovupd 192(a6=reg256#15 +# asm 2: vmovupd 192(a6=%ymm14 +vmovupd 192(%rsi),%ymm14 + +# qhasm: r = a6 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r18 ^= r +# asm 1: vpxor r5=reg256#9 +# asm 2: vmovapd r5=%ymm8 +vmovapd %ymm8,%ymm8 + +# qhasm: a5 = mem256[ input_1 + 160 ] +# asm 1: vmovupd 160(a5=reg256#15 +# asm 2: vmovupd 160(a5=%ymm14 +vmovupd 160(%rsi),%ymm14 + +# qhasm: r = a5 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 
160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r17 ^= r +# asm 1: vpxor r4=reg256#8 +# asm 2: vmovapd r4=%ymm7 +vmovapd %ymm7,%ymm7 + +# qhasm: a4 = mem256[ input_1 + 128 ] +# asm 1: vmovupd 128(a4=reg256#15 +# asm 2: vmovupd 128(a4=%ymm14 +vmovupd 128(%rsi),%ymm14 + +# qhasm: r = a4 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r16 ^= r +# asm 1: vpxor r3=reg256#7 +# asm 2: vmovapd r3=%ymm6 +vmovapd %ymm6,%ymm6 + +# qhasm: a3 = mem256[ input_1 + 96 ] +# asm 1: vmovupd 96(a3=reg256#15 +# asm 2: vmovupd 96(a3=%ymm14 +vmovupd 96(%rsi),%ymm14 + +# qhasm: r = a3 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: 
vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r15 ^= r +# asm 1: vpxor r2=reg256#6 +# asm 2: vmovapd r2=%ymm5 +vmovapd %ymm5,%ymm5 + +# qhasm: a2 = mem256[ input_1 + 64 ] +# asm 1: vmovupd 64(a2=reg256#15 +# asm 2: vmovupd 64(a2=%ymm14 +vmovupd 64(%rsi),%ymm14 + +# qhasm: r = a2 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r14 ^= r +# asm 1: vpxor r1=reg256#5 +# asm 2: vmovapd r1=%ymm4 +vmovapd %ymm4,%ymm4 + +# qhasm: a1 = mem256[ input_1 + 32 ] +# asm 1: vmovupd 32(a1=reg256#15 +# asm 2: vmovupd 32(a1=%ymm14 +vmovupd 32(%rsi),%ymm14 + +# qhasm: r = a1 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r1 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# 
qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r13 ^= r +# asm 1: vpxor r0=reg256#4 +# asm 2: vmovapd r0=%ymm3 +vmovapd %ymm3,%ymm3 + +# qhasm: a0 = mem256[ input_1 + 0 ] +# asm 1: vmovupd 0(a0=reg256#15 +# asm 2: vmovupd 0(a0=%ymm14 +vmovupd 0(%rsi),%ymm14 + +# qhasm: r = a0 & b0 +# asm 1: vpand r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm14,%ymm0,%ymm0 + +# qhasm: r0 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 32(r=%ymm0 +vpand 32(%rdx),%ymm14,%ymm0 + +# qhasm: r1 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 64(r=%ymm0 +vpand 64(%rdx),%ymm14,%ymm0 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 96(r=%ymm0 +vpand 96(%rdx),%ymm14,%ymm0 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 128(r=%ymm0 +vpand 128(%rdx),%ymm14,%ymm0 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 160(r=%ymm0 +vpand 160(%rdx),%ymm14,%ymm0 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 192(r=%ymm0 +vpand 192(%rdx),%ymm14,%ymm0 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 224(r=%ymm0 +vpand 224(%rdx),%ymm14,%ymm0 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 256(r=%ymm0 +vpand 256(%rdx),%ymm14,%ymm0 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 288(r=%ymm0 +vpand 288(%rdx),%ymm14,%ymm0 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 320(r=%ymm0 +vpand 320(%rdx),%ymm14,%ymm0 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 352(r=%ymm0 +vpand 352(%rdx),%ymm14,%ymm0 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 384(r=%ymm0 +vpand 384(%rdx),%ymm14,%ymm0 + +# qhasm: r12 ^= r +# asm 1: vpxor r=int64#7 +# asm 2: mov $0,>r=%rax +mov $0,%rax + +# qhasm: t0 = mem64[ input_0 + 192 ] +# asm 1: movq 192(t0=int64#2 +# asm 2: movq 192(t0=%rsi +movq 192(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 200 ] +# asm 1: movq 200(t1=int64#3 +# asm 2: movq 200(t1=%rdx +movq 200(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 176(t0=%rsi +movq 176(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 184 ] +# asm 1: movq 184(t1=int64#3 +# asm 2: movq 184(t1=%rdx +movq 184(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 160(t0=%rsi +movq 160(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 168 ] +# asm 1: movq 168(t1=int64#3 +# asm 2: movq 168(t1=%rdx +movq 168(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 144(t0=%rsi +movq 144(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 152 ] +# asm 1: movq 152(t1=int64#3 +# asm 2: movq 152(t1=%rdx +movq 152(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 128(t0=%rsi +movq 128(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 136 ] +# asm 1: movq 136(t1=int64#3 +# asm 2: movq 136(t1=%rdx +movq 136(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 112(t0=%rsi +movq 112(%rdi),%rsi + +# 
qhasm: t1 = mem64[ input_0 + 120 ] +# asm 1: movq 120(t1=int64#3 +# asm 2: movq 120(t1=%rdx +movq 120(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 96(t0=%rsi +movq 96(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 104 ] +# asm 1: movq 104(t1=int64#3 +# asm 2: movq 104(t1=%rdx +movq 104(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 80(t0=%rsi +movq 80(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 88 ] +# asm 1: movq 88(t1=int64#3 +# asm 2: movq 88(t1=%rdx +movq 88(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 64(t0=%rsi +movq 64(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 72 ] +# asm 1: movq 72(t1=int64#3 +# asm 2: movq 72(t1=%rdx +movq 72(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 48(t0=%rsi +movq 48(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 56 ] +# asm 1: movq 56(t1=int64#3 +# asm 2: movq 56(t1=%rdx +movq 56(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 32(t0=%rsi +movq 32(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 40 ] +# asm 1: movq 40(t1=int64#3 +# asm 2: movq 40(t1=%rdx +movq 40(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 16(t0=%rsi +movq 16(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 24 ] +# asm 1: movq 24(t1=int64#3 +# asm 2: movq 24(t1=%rdx +movq 24(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 0(t0=%rsi +movq 0(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(t1=int64#1 +# asm 2: movq 8(t1=%rdi +movq 8(%rdi),%rdi + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#1 +# asm 2: popcnt c=%rdi +popcnt %rsi, %rdi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1, +#include + +#include "aes.h" + + +void PQCLEAN_MCELIECE6688128F_AVX_aes256ctr( + uint8_t *out, + size_t outlen, + const uint8_t nonce[AESCTR_NONCEBYTES], + const uint8_t key[AES256_KEYBYTES] +); + +#endif diff --git a/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/api.h b/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/api.h new file mode 100644 index 0000000000..40dcac7a65 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/api.h @@ -0,0 +1,32 @@ +#ifndef PQCLEAN_MCELIECE6688128F_AVX_API_H +#define PQCLEAN_MCELIECE6688128F_AVX_API_H + +#include + +#define PQCLEAN_MCELIECE6688128F_AVX_CRYPTO_ALGNAME "Classic McEliece 6688128" +#define PQCLEAN_MCELIECE6688128F_AVX_CRYPTO_PUBLICKEYBYTES 1044992 +#define PQCLEAN_MCELIECE6688128F_AVX_CRYPTO_SECRETKEYBYTES 13892 +#define PQCLEAN_MCELIECE6688128F_AVX_CRYPTO_CIPHERTEXTBYTES 240 +#define PQCLEAN_MCELIECE6688128F_AVX_CRYPTO_BYTES 32 + + +int PQCLEAN_MCELIECE6688128F_AVX_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const uint8_t *pk +); + +int PQCLEAN_MCELIECE6688128F_AVX_crypto_kem_dec( + uint8_t *key, + const uint8_t *c, + const uint8_t *sk +); + +int 
PQCLEAN_MCELIECE6688128F_AVX_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/benes.c b/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/benes.c new file mode 100644 index 0000000000..50f1c535bb --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/benes.c @@ -0,0 +1,311 @@ +/* + This file is for Benes network related functions +*/ + +#include "benes.h" + +#include "params.h" +#include "transpose.h" +#include "util.h" + +static void layer_x(vec128 *data, vec128 *bits) { + int i; + vec128 v0, v1; + vec128 d; + + for (i = 0; i < 64; i += 2) { + v0 = PQCLEAN_MCELIECE6688128F_AVX_vec128_unpack_low(data[i + 0], data[i + 1]); + v1 = PQCLEAN_MCELIECE6688128F_AVX_vec128_unpack_high(data[i + 0], data[i + 1]); + + d = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(v0, v1); + d = PQCLEAN_MCELIECE6688128F_AVX_vec128_and(d, *bits++); + v0 = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(v0, d); + v1 = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(v1, d); + + data[i + 0] = PQCLEAN_MCELIECE6688128F_AVX_vec128_unpack_low(v0, v1); + data[i + 1] = PQCLEAN_MCELIECE6688128F_AVX_vec128_unpack_high(v0, v1); + } +} + +static void layer_0(vec128 *bs, vec128 *cond) { + int x; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 2) { + diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ x ], bs[ x + 1 ]); + diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_and(diff, *cond++); + bs[ x ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ x ], diff); + bs[ x + 1 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ x + 1 ], diff); + } +} + +static void layer_1(vec128 *bs, vec128 *cond) { + int x; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 4) { + diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ x + 0 ], bs[ x + 2 ]); + diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_and(diff, cond[0]); + bs[ x + 0 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ x + 0 ], diff); + bs[ x + 2 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ x + 2 ], diff); + + diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ x + 1 ], bs[ x + 3 ]); + diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_and(diff, cond[1]); + bs[ x + 1 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ x + 1 ], diff); + bs[ x + 3 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ x + 3 ], diff); + + cond += 2; + } +} + +static void layer_2(vec128 *bs, vec128 *cond) { + int x; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 8) { + diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ x + 0 ], bs[ x + 4 ]); + diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_and(diff, cond[0]); + bs[ x + 0 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ x + 0 ], diff); + bs[ x + 4 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ x + 4 ], diff); + + diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ x + 1 ], bs[ x + 5 ]); + diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_and(diff, cond[1]); + bs[ x + 1 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ x + 1 ], diff); + bs[ x + 5 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ x + 5 ], diff); + + diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ x + 2 ], bs[ x + 6 ]); + diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_and(diff, cond[2]); + bs[ x + 2 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ x + 2 ], diff); + bs[ x + 6 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ x + 6 ], diff); + + diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ x + 3 ], bs[ x + 7 ]); + diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_and(diff, cond[3]); + bs[ x + 3 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ x + 3 ], diff); + bs[ x + 
7 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ x + 7 ], diff); + + cond += 4; + } +} + +static void layer_3(vec128 *bs, vec128 *cond) { + int x, s; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 16) { + for (s = x; s < x + 8; s += 4) { + diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 0 ], bs[ s + 8 ]); + diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_and(diff, cond[0]); + bs[ s + 0 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 0 ], diff); + bs[ s + 8 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 8 ], diff); + + diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 1 ], bs[ s + 9 ]); + diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_and(diff, cond[1]); + bs[ s + 1 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 1 ], diff); + bs[ s + 9 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 9 ], diff); + + diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 2 ], bs[ s + 10 ]); + diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_and(diff, cond[2]); + bs[ s + 2 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 2 ], diff); + bs[ s + 10 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 10 ], diff); + + diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 3 ], bs[ s + 11 ]); + diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_and(diff, cond[3]); + bs[ s + 3 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 3 ], diff); + bs[ s + 11 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 11 ], diff); + + cond += 4; + } + } +} + +static void layer_4(vec128 *bs, vec128 *cond) { + int x, s; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 32) { + for (s = x; s < x + 16; s += 4) { + diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 0 ], bs[ s + 16 ]); + diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_and(diff, cond[0]); + bs[ s + 0 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 0 ], diff); + bs[ s + 16 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 16 ], diff); + + diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 1 ], bs[ s + 17 ]); + diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_and(diff, cond[1]); + bs[ s + 1 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 1 ], diff); + bs[ s + 17 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 17 ], diff); + + diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 2 ], bs[ s + 18 ]); + diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_and(diff, cond[2]); + bs[ s + 2 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 2 ], diff); + bs[ s + 18 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 18 ], diff); + + diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 3 ], bs[ s + 19 ]); + diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_and(diff, cond[3]); + bs[ s + 3 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 3 ], diff); + bs[ s + 19 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 19 ], diff); + + cond += 4; + } + } +} + +static void layer_5(vec128 *bs, vec128 *cond) { + int x, s; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 64) { + for (s = x; s < x + 32; s += 4) { + diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 0 ], bs[ s + 32 ]); + diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_and(diff, cond[0]); + bs[ s + 0 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 0 ], diff); + bs[ s + 32 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 32 ], diff); + + diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 1 ], bs[ s + 33 ]); + diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_and(diff, cond[1]); + bs[ s + 1 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 1 ], diff); + bs[ s + 33 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ 
s + 33 ], diff); + + diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 2 ], bs[ s + 34 ]); + diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_and(diff, cond[2]); + bs[ s + 2 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 2 ], diff); + bs[ s + 34 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 34 ], diff); + + diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 3 ], bs[ s + 35 ]); + diff = PQCLEAN_MCELIECE6688128F_AVX_vec128_and(diff, cond[3]); + bs[ s + 3 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 3 ], diff); + bs[ s + 35 ] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(bs[ s + 35 ], diff); + + cond += 4; + } + } +} + +/* input: bits, control bits as array of bytes */ +/* output: bits_int, control bits as array of 128-bit vectors */ +void PQCLEAN_MCELIECE6688128F_AVX_load_bits(vec128 bits_int[][32], const unsigned char *bits) { + int i, j; + const unsigned char *ptr = bits; + + vec128 buf[64]; + + for (i = 0; i <= 5; i += 2) { + for (j = 0; j < 64; j++) { + buf[j] = PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(PQCLEAN_MCELIECE6688128F_AVX_load8(ptr), PQCLEAN_MCELIECE6688128F_AVX_load8(ptr + 512)); + ptr += 8; + } + + PQCLEAN_MCELIECE6688128F_AVX_transpose_64x128_sp( buf ); + + for (j = 0; j < 32; j++) { + bits_int[i + 0][j] = PQCLEAN_MCELIECE6688128F_AVX_vec128_unpack_low(buf[j], buf[j + 32]); + bits_int[i + 1][j] = PQCLEAN_MCELIECE6688128F_AVX_vec128_unpack_high(buf[j], buf[j + 32]); + } + + ptr += 512; + } + + for (i = 6; i <= 18; i++) { + for (j = 0; j < 32; j++) { + bits_int[i][j] = PQCLEAN_MCELIECE6688128F_AVX_load16(ptr); + ptr += 16; + } + } + + for (i = 19; i < 25; i += 2) { + for (j = 0; j < 64; j++) { + buf[j] = PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(PQCLEAN_MCELIECE6688128F_AVX_load8(ptr), PQCLEAN_MCELIECE6688128F_AVX_load8(ptr + 512)); + ptr += 8; + } + + PQCLEAN_MCELIECE6688128F_AVX_transpose_64x128_sp( buf ); + + for (j = 0; j < 32; j++) { + bits_int[i + 0][j] = PQCLEAN_MCELIECE6688128F_AVX_vec128_unpack_low(buf[j], buf[j + 32]); + bits_int[i + 1][j] = PQCLEAN_MCELIECE6688128F_AVX_vec128_unpack_high(buf[j], buf[j + 32]); + } + + ptr += 512; + } +} + +/* input: r, sequence of bits to be permuted */ +/* b, control bits as array of 128-bit vectors */ +/* rev, 0 for normal application; !0 for inverse */ +/* output: r, permuted bits */ +void PQCLEAN_MCELIECE6688128F_AVX_benes(vec128 *r, vec128 b[][32], int rev) { + int inc; + + vec128 *b_ptr; + + if (rev == 0) { + inc = 32; + b_ptr = b[ 0]; + } else { + inc = -32; + b_ptr = b[24]; + } + + // + + PQCLEAN_MCELIECE6688128F_AVX_transpose_64x128_sp( r ); + + layer_0(r, b_ptr); + b_ptr += inc; + layer_1(r, b_ptr); + b_ptr += inc; + layer_2(r, b_ptr); + b_ptr += inc; + layer_3(r, b_ptr); + b_ptr += inc; + layer_4(r, b_ptr); + b_ptr += inc; + layer_5(r, b_ptr); + b_ptr += inc; + + PQCLEAN_MCELIECE6688128F_AVX_transpose_64x128_sp( r ); + + layer_x(r, b_ptr); + b_ptr += inc; + layer_0(r, b_ptr); + b_ptr += inc; + layer_1(r, b_ptr); + b_ptr += inc; + layer_2(r, b_ptr); + b_ptr += inc; + layer_3(r, b_ptr); + b_ptr += inc; + layer_4(r, b_ptr); + b_ptr += inc; + layer_5(r, b_ptr); + b_ptr += inc; + layer_4(r, b_ptr); + b_ptr += inc; + layer_3(r, b_ptr); + b_ptr += inc; + layer_2(r, b_ptr); + b_ptr += inc; + layer_1(r, b_ptr); + b_ptr += inc; + layer_0(r, b_ptr); + b_ptr += inc; + layer_x(r, b_ptr); + b_ptr += inc; + + PQCLEAN_MCELIECE6688128F_AVX_transpose_64x128_sp( r ); + + layer_5(r, b_ptr); + b_ptr += inc; + layer_4(r, b_ptr); + b_ptr += inc; + layer_3(r, b_ptr); + b_ptr += inc; + layer_2(r, b_ptr); + b_ptr += inc; + 
layer_1(r, b_ptr); + b_ptr += inc; + layer_0(r, b_ptr); + //b_ptr += inc; + + PQCLEAN_MCELIECE6688128F_AVX_transpose_64x128_sp( r ); +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/benes.h b/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/benes.h new file mode 100644 index 0000000000..905e5eac9b --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/benes.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE6688128F_AVX_BENES_H +#define PQCLEAN_MCELIECE6688128F_AVX_BENES_H +/* + This file is for Benes network related functions +*/ + + +#include "vec128.h" + +void PQCLEAN_MCELIECE6688128F_AVX_load_bits(vec128 /*bits_int*/[][32], const unsigned char * /*bits*/); +void PQCLEAN_MCELIECE6688128F_AVX_benes(vec128 * /*r*/, vec128 /*b*/[][32], int /*rev*/); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/bm.c b/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/bm.c new file mode 100644 index 0000000000..20b8d80864 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/bm.c @@ -0,0 +1,219 @@ +/* + This file is for the inversion-free Berlekamp-Massey algorithm + see https://ieeexplore.ieee.org/document/87857 +*/ + +#include "bm.h" + +#include "gf.h" +#include "params.h" +#include "vec128.h" + +#include + +extern gf PQCLEAN_MCELIECE6688128F_AVX_vec_reduce_asm(vec128 *); +extern void PQCLEAN_MCELIECE6688128F_AVX_update_asm(void *, gf, int); + +static inline uint16_t mask_nonzero(gf a) { + uint32_t ret = a; + + ret -= 1; + ret >>= 31; + ret -= 1; + + return ret; +} + +static inline uint16_t mask_leq(uint16_t a, uint16_t b) { + uint32_t a_tmp = a; + uint32_t b_tmp = b; + uint32_t ret = b_tmp - a_tmp; + + ret >>= 31; + ret -= 1; + + return ret; +} + +static inline void vec128_cmov(vec128 out[][2], uint16_t mask) { + int i; + + vec128 v0, v1; + + vec128 m0 = PQCLEAN_MCELIECE6688128F_AVX_vec128_set1_16b( mask); + vec128 m1 = PQCLEAN_MCELIECE6688128F_AVX_vec128_set1_16b(~mask); + + for (i = 0; i < GFBITS; i++) { + v0 = PQCLEAN_MCELIECE6688128F_AVX_vec128_and(out[i][1], m0); + v1 = PQCLEAN_MCELIECE6688128F_AVX_vec128_and(out[i][0], m1); + out[i][0] = PQCLEAN_MCELIECE6688128F_AVX_vec128_or(v0, v1); + } +} + +static inline void interleave(vec256 *in, int idx0, int idx1, vec256 *mask, int b) { + int s = 1 << b; + + vec256 x, y; + + x = PQCLEAN_MCELIECE6688128F_AVX_vec256_or(PQCLEAN_MCELIECE6688128F_AVX_vec256_and(in[idx0], mask[0]), + PQCLEAN_MCELIECE6688128F_AVX_vec256_sll_4x(PQCLEAN_MCELIECE6688128F_AVX_vec256_and(in[idx1], mask[0]), s)); + + y = PQCLEAN_MCELIECE6688128F_AVX_vec256_or(PQCLEAN_MCELIECE6688128F_AVX_vec256_srl_4x(PQCLEAN_MCELIECE6688128F_AVX_vec256_and(in[idx0], mask[1]), s), + PQCLEAN_MCELIECE6688128F_AVX_vec256_and(in[idx1], mask[1])); + + in[idx0] = x; + in[idx1] = y; +} + +/* input: in, field elements in bitsliced form */ +/* output: out, field elements in non-bitsliced form */ +static inline void get_coefs(gf *out, vec256 *in) { + int i, k; + + vec256 mask[4][2]; + vec256 buf[16]; + + for (i = 0; i < 13; i++) { + buf[i] = in[i]; + } + for (i = 13; i < 16; i++) { + buf[i] = PQCLEAN_MCELIECE6688128F_AVX_vec256_setzero(); + } + + mask[0][0] = PQCLEAN_MCELIECE6688128F_AVX_vec256_set1_16b(0x5555); + mask[0][1] = PQCLEAN_MCELIECE6688128F_AVX_vec256_set1_16b(0xAAAA); + mask[1][0] = PQCLEAN_MCELIECE6688128F_AVX_vec256_set1_16b(0x3333); + mask[1][1] = PQCLEAN_MCELIECE6688128F_AVX_vec256_set1_16b(0xCCCC); + mask[2][0] = PQCLEAN_MCELIECE6688128F_AVX_vec256_set1_16b(0x0F0F); + mask[2][1] = 
PQCLEAN_MCELIECE6688128F_AVX_vec256_set1_16b(0xF0F0); + mask[3][0] = PQCLEAN_MCELIECE6688128F_AVX_vec256_set1_16b(0x00FF); + mask[3][1] = PQCLEAN_MCELIECE6688128F_AVX_vec256_set1_16b(0xFF00); + + interleave(buf, 0, 8, mask[3], 3); + interleave(buf, 1, 9, mask[3], 3); + interleave(buf, 2, 10, mask[3], 3); + interleave(buf, 3, 11, mask[3], 3); + interleave(buf, 4, 12, mask[3], 3); + interleave(buf, 5, 13, mask[3], 3); + interleave(buf, 6, 14, mask[3], 3); + interleave(buf, 7, 15, mask[3], 3); + + interleave(buf, 0, 4, mask[2], 2); + interleave(buf, 1, 5, mask[2], 2); + interleave(buf, 2, 6, mask[2], 2); + interleave(buf, 3, 7, mask[2], 2); + interleave(buf, 8, 12, mask[2], 2); + interleave(buf, 9, 13, mask[2], 2); + interleave(buf, 10, 14, mask[2], 2); + interleave(buf, 11, 15, mask[2], 2); + + interleave(buf, 0, 2, mask[1], 1); + interleave(buf, 1, 3, mask[1], 1); + interleave(buf, 4, 6, mask[1], 1); + interleave(buf, 5, 7, mask[1], 1); + interleave(buf, 8, 10, mask[1], 1); + interleave(buf, 9, 11, mask[1], 1); + interleave(buf, 12, 14, mask[1], 1); + interleave(buf, 13, 15, mask[1], 1); + + interleave(buf, 0, 1, mask[0], 0); + interleave(buf, 2, 3, mask[0], 0); + interleave(buf, 4, 5, mask[0], 0); + interleave(buf, 6, 7, mask[0], 0); + interleave(buf, 8, 9, mask[0], 0); + interleave(buf, 10, 11, mask[0], 0); + interleave(buf, 12, 13, mask[0], 0); + interleave(buf, 14, 15, mask[0], 0); + + for (i = 0; i < 16; i++) { + for (k = 0; k < 4; k++) { + out[ (4 * 0 + k) * 16 + i ] = (PQCLEAN_MCELIECE6688128F_AVX_vec256_extract(buf[i], 0) >> (k * 16)) & GFMASK; + out[ (4 * 1 + k) * 16 + i ] = (PQCLEAN_MCELIECE6688128F_AVX_vec256_extract(buf[i], 1) >> (k * 16)) & GFMASK; + out[ (4 * 2 + k) * 16 + i ] = (PQCLEAN_MCELIECE6688128F_AVX_vec256_extract(buf[i], 2) >> (k * 16)) & GFMASK; + out[ (4 * 3 + k) * 16 + i ] = (PQCLEAN_MCELIECE6688128F_AVX_vec256_extract(buf[i], 3) >> (k * 16)) & GFMASK; + } + } +} + +typedef union { + vec128 as_128[GFBITS][2]; + vec256 as_256[GFBITS]; +} aligned_double_vec128; + +/* input: in, sequence of field elements */ +/* output: out, minimal polynomial of in */ +void PQCLEAN_MCELIECE6688128F_AVX_bm(vec128 *out, vec256 *in) { + int i; + uint16_t N, L; + uint16_t mask; + uint64_t one = 1, t; + + vec128 prod[ GFBITS ]; + vec128 interval[GFBITS]; + + aligned_double_vec128 db; + aligned_double_vec128 BC_tmp; + aligned_double_vec128 BC; + + gf d, b, c0 = 1; + gf coefs[256]; + + // initialization + + get_coefs(coefs, in); + + BC.as_128[0][0] = PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0, one << 63); + BC.as_128[0][1] = PQCLEAN_MCELIECE6688128F_AVX_vec128_setzero(); + + for (i = 1; i < GFBITS; i++) { + BC.as_128[i][0] = BC.as_128[i][1] = PQCLEAN_MCELIECE6688128F_AVX_vec128_setzero(); + } + + b = 1; + L = 0; + + // + + for (i = 0; i < GFBITS; i++) { + interval[i] = PQCLEAN_MCELIECE6688128F_AVX_vec128_setzero(); + } + + for (N = 0; N < 256; N++) { + PQCLEAN_MCELIECE6688128F_AVX_vec128_mul_asm(prod, interval, BC.as_128[0] + 1, 32); + PQCLEAN_MCELIECE6688128F_AVX_update_asm(interval, coefs[N], 16); + + d = PQCLEAN_MCELIECE6688128F_AVX_vec_reduce_asm(prod); + + t = PQCLEAN_MCELIECE6688128F_AVX_gf_mul2(c0, coefs[N], b); + d ^= t & 0xFFFFFFFF; + + mask = mask_nonzero(d) & mask_leq(L * 2, N); + + for (i = 0; i < GFBITS; i++) { + db.as_128[i][0] = PQCLEAN_MCELIECE6688128F_AVX_vec128_setbits((d >> i) & 1); + db.as_128[i][1] = PQCLEAN_MCELIECE6688128F_AVX_vec128_setbits((b >> i) & 1); + } + + PQCLEAN_MCELIECE6688128F_AVX_vec256_mul(BC_tmp.as_256, db.as_256, BC.as_256); + + 
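+        // Note (descriptive comment only, not from upstream): 'mask' is all-ones exactly when d != 0 and 2*L <= N,
+        // i.e. the Berlekamp-Massey branch taken in constant time; BC_tmp (just computed) holds the bitsliced
+        // products of the pair (d, b) with the stored polynomial pair BC, combined below to update C.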
vec128_cmov(BC.as_128, mask); + PQCLEAN_MCELIECE6688128F_AVX_update_asm(BC.as_128, c0 & mask, 32); + + for (i = 0; i < GFBITS; i++) { + BC.as_128[i][1] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(BC_tmp.as_128[i][0], BC_tmp.as_128[i][1]); + } + + c0 = t >> 32; + b = (d & mask) | (b & ~mask); + L = ((N + 1 - L) & mask) | (L & ~mask); + } + + c0 = PQCLEAN_MCELIECE6688128F_AVX_gf_inv(c0); + + for (i = 0; i < GFBITS; i++) { + prod[i] = PQCLEAN_MCELIECE6688128F_AVX_vec128_setbits((c0 >> i) & 1); + } + + PQCLEAN_MCELIECE6688128F_AVX_vec128_mul_asm(out, prod, BC.as_128[0] + 1, 32); +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/bm.h b/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/bm.h new file mode 100644 index 0000000000..a4b6d9b2dc --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/bm.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE6688128F_AVX_BM_H +#define PQCLEAN_MCELIECE6688128F_AVX_BM_H +/* + This file is for the inversion-free Berlekamp-Massey algorithm + see https://ieeexplore.ieee.org/document/87857 +*/ + +#include "vec128.h" +#include "vec256.h" + +void PQCLEAN_MCELIECE6688128F_AVX_bm(vec128 * /*out*/, vec256 * /*in*/); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/consts.S b/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/consts.S new file mode 100644 index 0000000000..f9127c38d5 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/consts.S @@ -0,0 +1,33 @@ +.data + +# not supported on MacOS +#.section .rodata + +.globl PQCLEAN_MCELIECE6688128F_AVX_MASK0_0 +.globl PQCLEAN_MCELIECE6688128F_AVX_MASK0_1 +.globl PQCLEAN_MCELIECE6688128F_AVX_MASK1_0 +.globl PQCLEAN_MCELIECE6688128F_AVX_MASK1_1 +.globl PQCLEAN_MCELIECE6688128F_AVX_MASK2_0 +.globl PQCLEAN_MCELIECE6688128F_AVX_MASK2_1 +.globl PQCLEAN_MCELIECE6688128F_AVX_MASK3_0 +.globl PQCLEAN_MCELIECE6688128F_AVX_MASK3_1 +.globl PQCLEAN_MCELIECE6688128F_AVX_MASK4_0 +.globl PQCLEAN_MCELIECE6688128F_AVX_MASK4_1 +.globl PQCLEAN_MCELIECE6688128F_AVX_MASK5_0 +.globl PQCLEAN_MCELIECE6688128F_AVX_MASK5_1 + +.p2align 5 + +PQCLEAN_MCELIECE6688128F_AVX_MASK0_0: .quad 0x5555555555555555, 0x5555555555555555, 0x5555555555555555, 0x5555555555555555 +PQCLEAN_MCELIECE6688128F_AVX_MASK0_1: .quad 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA +PQCLEAN_MCELIECE6688128F_AVX_MASK1_0: .quad 0x3333333333333333, 0x3333333333333333, 0x3333333333333333, 0x3333333333333333 +PQCLEAN_MCELIECE6688128F_AVX_MASK1_1: .quad 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC +PQCLEAN_MCELIECE6688128F_AVX_MASK2_0: .quad 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F +PQCLEAN_MCELIECE6688128F_AVX_MASK2_1: .quad 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0 +PQCLEAN_MCELIECE6688128F_AVX_MASK3_0: .quad 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF +PQCLEAN_MCELIECE6688128F_AVX_MASK3_1: .quad 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00 +PQCLEAN_MCELIECE6688128F_AVX_MASK4_0: .quad 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF +PQCLEAN_MCELIECE6688128F_AVX_MASK4_1: .quad 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000 +PQCLEAN_MCELIECE6688128F_AVX_MASK5_0: .quad 0x00000000FFFFFFFF, 0x00000000FFFFFFFF, 0x00000000FFFFFFFF, 0x00000000FFFFFFFF +PQCLEAN_MCELIECE6688128F_AVX_MASK5_1: .quad 0xFFFFFFFF00000000, 
0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000 + diff --git a/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/consts.inc b/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/consts.inc new file mode 100644 index 0000000000..0415c18e13 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/consts.inc @@ -0,0 +1,502 @@ +//64 +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X6969969669699696, 0X6969969669699696, 0X6969969669699696, 0X6969969669699696), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X9966669966999966, 0X9966669966999966, 0X9966669966999966, 0X9966669966999966), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X9966669966999966, 0X9966669966999966, 0X9966669966999966, 0X9966669966999966), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0XFF0000FF00FFFF00, 0XFF0000FF00FFFF00, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XCC3333CCCC3333CC, 0XCC3333CCCC3333CC, 0XCC3333CCCC3333CC, 0XCC3333CCCC3333CC), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X9966669966999966, 0X9966669966999966, 0X9966669966999966, 0X9966669966999966), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X6666666666666666, 0X6666666666666666, 0X6666666666666666, 0X6666666666666666), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA55AA55AA55AA55A, 0XA55AA55AA55AA55A, 0XA55AA55AA55AA55A, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XCCCC33333333CCCC, 0XCCCC33333333CCCC, 0XCCCC33333333CCCC, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0FF0F00FF00F0FF0, 0X0FF0F00FF00F0FF0, 0X0FF0F00FF00F0FF0, 0X0FF0F00FF00F0FF0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A) +}, +//128 +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X9999999966666666, 0X6666666699999999, 0X9999999966666666, 0X6666666699999999), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3, 0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XCC33CC3333CC33CC, 0X33CC33CCCC33CC33, 0XCC33CC3333CC33CC, 0X33CC33CCCC33CC33), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X3C3C3C3C3C3C3C3C, 0X3C3C3C3C3C3C3C3C, 0X3C3C3C3C3C3C3C3C, 0X3C3C3C3C3C3C3C3C), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XAA5555AAAA5555AA, 0XAA5555AAAA5555AA, 0XAA5555AAAA5555AA, 0XAA5555AAAA5555AA), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XC33C3CC33CC3C33C, 0XC33C3CC33CC3C33C, 0XC33C3CC33CC3C33C, 0XC33C3CC33CC3C33C), + 
PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X00FFFF0000FFFF00, 0XFF0000FFFF0000FF, 0X00FFFF0000FFFF00, 0XFF0000FFFF0000FF) +}, +//256 +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0FF0F00FF00F0FF0, 0X0FF0F00FF00F0FF0, 0XF00F0FF00FF0F00F, 0XF00F0FF00FF0F00F), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X9669966969966996, 0X6996699696699669, 0X6996699696699669, 0X9669966969966996), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000FFFFFFFF0000, 0XFFFF00000000FFFF, 0X0000FFFFFFFF0000, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X33333333CCCCCCCC, 0X33333333CCCCCCCC, 0X33333333CCCCCCCC, 0X33333333CCCCCCCC), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0X5AA5A55AA55A5AA5, 0X5AA5A55AA55A5AA5, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X00FFFF0000FFFF00, 0XFF0000FFFF0000FF, 0XFF0000FFFF0000FF, 0X00FFFF0000FFFF00), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XC33CC33CC33CC33C, 0XC33CC33CC33CC33C, 0XC33CC33CC33CC33C, 0XC33CC33CC33CC33C), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555) +}, +//512 +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XC33C3CC33CC3C33C, 0X3CC3C33CC33C3CC3, 0XC33C3CC33CC3C33C, 0X3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X9966669966999966, 0X6699996699666699, 0X9966669966999966, 0X6699996699666699), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X9966996699669966, 0X6699669966996699, 0X6699669966996699, 0X9966996699669966), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X6969969669699696, 0X6969969669699696, 0X6969969669699696, 0X6969969669699696), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XAA55AA5555AA55AA, 0XAA55AA5555AA55AA, 0XAA55AA5555AA55AA, 0XAA55AA5555AA55AA), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X9966996699669966, 0X9966996699669966, 0X6699669966996699, 0X6699669966996699), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5AA5A55A5AA5A55A, 0XA55A5AA5A55A5AA5, 0X5AA5A55A5AA5A55A, 0XA55A5AA5A55A5AA5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XC3C3C3C33C3C3C3C, 0XC3C3C3C33C3C3C3C, 0X3C3C3C3CC3C3C3C3, 0X3C3C3C3CC3C3C3C3), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X3333CCCC3333CCCC, 0X3333CCCC3333CCCC, 0XCCCC3333CCCC3333, 0XCCCC3333CCCC3333), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X9999999966666666, 0X6666666699999999, 0X6666666699999999, 0X9999999966666666), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XC33CC33CC33CC33C, 0X3CC33CC33CC33CC3, 0XC33CC33CC33CC33C, 0X3CC33CC33CC33CC3), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X6666999999996666, 0X9999666666669999, 0X9999666666669999, 0X6666999999996666) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XC33C3CC33CC3C33C, 0X3CC3C33CC33C3CC3, 0XC33C3CC33CC3C33C, 0X3CC3C33CC33C3CC3), + 
PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X6699996699666699, 0X9966669966999966, 0X6699996699666699, 0X9966669966999966), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X6699669966996699, 0X9966996699669966, 0X9966996699669966, 0X6699669966996699), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X6969969669699696, 0X6969969669699696, 0X6969969669699696, 0X6969969669699696), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X55AA55AAAA55AA55, 0X55AA55AAAA55AA55, 0X55AA55AAAA55AA55, 0X55AA55AAAA55AA55), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X9966996699669966, 0X9966996699669966, 0X6699669966996699, 0X6699669966996699), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5AA5A55A5AA5A55A, 0XA55A5AA5A55A5AA5, 0X5AA5A55A5AA5A55A, 0XA55A5AA5A55A5AA5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XC3C3C3C33C3C3C3C, 0XC3C3C3C33C3C3C3C, 0X3C3C3C3CC3C3C3C3, 0X3C3C3C3CC3C3C3C3), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X3333CCCC3333CCCC, 0X3333CCCC3333CCCC, 0XCCCC3333CCCC3333, 0XCCCC3333CCCC3333), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X9999999966666666, 0X6666666699999999, 0X6666666699999999, 0X9999999966666666), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XC33CC33CC33CC33C, 0X3CC33CC33CC33CC3, 0XC33CC33CC33CC33C, 0X3CC33CC33CC33CC3), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X6666999999996666, 0X9999666666669999, 0X9999666666669999, 0X6666999999996666) +}, +//1024 +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3, 0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA, 0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA, 0XAA5555AA55AAAA55, 0XAA5555AA55AAAA55), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF00F0FF0F00F0FF0, 0XF00F0FF0F00F0FF0, 0X0FF0F00F0FF0F00F, 0X0FF0F00F0FF0F00F), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X9669699696696996, 0X9669699696696996, 0X9669699696696996, 0X9669699696696996), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA55AA55AA55AA55A, 0X5AA55AA55AA55AA5, 0X5AA55AA55AA55AA5, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA, 0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XCCCC33333333CCCC, 0X3333CCCCCCCC3333, 0X3333CCCCCCCC3333, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000, 0XFFFF00000000FFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X6996699669966996, 0X9669966996699669, 0X9669966996699669, 0X6996699669966996) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3, 0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA, 0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + 
PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XAA5555AA55AAAA55, 0XAA5555AA55AAAA55, 0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0FF0F00F0FF0F00F, 0X0FF0F00F0FF0F00F, 0XF00F0FF0F00F0FF0, 0XF00F0FF0F00F0FF0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X6996966969969669, 0X6996966969969669, 0X6996966969969669, 0X6996966969969669), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA55AA55AA55AA55A, 0X5AA55AA55AA55AA5, 0X5AA55AA55AA55AA5, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555, 0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XCCCC33333333CCCC, 0X3333CCCCCCCC3333, 0X3333CCCCCCCC3333, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000, 0XFFFF00000000FFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X6996699669966996, 0X9669966996699669, 0X9669966996699669, 0X6996699669966996) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3, 0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555, 0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XAA5555AA55AAAA55, 0XAA5555AA55AAAA55, 0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF00F0FF0F00F0FF0, 0XF00F0FF0F00F0FF0, 0X0FF0F00F0FF0F00F, 0X0FF0F00F0FF0F00F), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X9669699696696996, 0X9669699696696996, 0X9669699696696996, 0X9669699696696996), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA55AA55AA55AA55A, 0X5AA55AA55AA55AA5, 0X5AA55AA55AA55AA5, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA, 0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XCCCC33333333CCCC, 0X3333CCCCCCCC3333, 0X3333CCCCCCCC3333, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000, 0XFFFF00000000FFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X6996699669966996, 0X9669966996699669, 0X9669966996699669, 0X6996699669966996) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3, 0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555, 0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA, 0XAA5555AA55AAAA55, 0XAA5555AA55AAAA55), + 
PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0FF0F00F0FF0F00F, 0X0FF0F00F0FF0F00F, 0XF00F0FF0F00F0FF0, 0XF00F0FF0F00F0FF0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X6996966969969669, 0X6996966969969669, 0X6996966969969669, 0X6996966969969669), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA55AA55AA55AA55A, 0X5AA55AA55AA55AA5, 0X5AA55AA55AA55AA5, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555, 0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XCCCC33333333CCCC, 0X3333CCCCCCCC3333, 0X3333CCCCCCCC3333, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000, 0XFFFF00000000FFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X6996699669966996, 0X9669966996699669, 0X9669966996699669, 0X6996699669966996) +}, +//2048 +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA, 0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F, 0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55, 0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0, 0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + 
PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55, 0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F, 0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA, 0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0, 0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + 
PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5555555555555555, 0X5555555555555555, 0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA, 0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F, 0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5555555555555555, 0X5555555555555555, 0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55, 0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0, 0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + 
PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5555555555555555, 0X5555555555555555, 0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55, 0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F, 0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5555555555555555, 0X5555555555555555, 0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA, 0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0, 0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +//4096 +{ + 
PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + 
PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + 
PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + 
PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + 
PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + 
PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + 
PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + 
PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +} diff --git a/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/controlbits.c b/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/controlbits.c new file mode 100644 index 0000000000..0a015282c0 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/controlbits.c @@ -0,0 +1,274 @@ +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. 
a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + +#include "controlbits.h" + +#include "params.h" + +#include + +typedef uint8_t bit; + +#define N (1 << GFBITS) + +static bit is_smaller(uint32_t a, uint32_t b) { + uint32_t ret = 0; + + ret = a - b; + ret >>= 31; + + return (bit)ret; +} + +static bit is_smaller_63b(uint64_t a, uint64_t b) { + uint64_t ret = 0; + + ret = a - b; + ret >>= 63; + + return (bit)ret; +} + +static void cswap(uint32_t *x, uint32_t *y, bit swap) { + uint32_t m; + uint32_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +static void cswap_63b(uint64_t *x, uint64_t *y, bit swap) { + uint64_t m; + uint64_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +/* output x = min(input x,input y) */ +/* output y = max(input x,input y) */ + +static void minmax(uint32_t *x, uint32_t *y) { + bit m; + + m = is_smaller(*y, *x); + cswap(x, y, m); +} + +static void minmax_63b(uint64_t *x, uint64_t *y) { + bit m; + + m = is_smaller_63b(*y, *x); + cswap_63b(x, y, m); +} + +/* merge first half of x[0],x[step],...,x[(2*n-1)*step] with second half */ +/* requires n to be a power of 2 */ + +static void merge(int n, uint32_t *x, int step) { + int i; + if (n == 1) { + minmax(&x[0], &x[step]); + } else { + merge(n / 2, x, step * 2); + merge(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax(&x[i * step], &x[(i + 1) * step]); + } + } +} + +static void merge_63b(int n, uint64_t *x, int step) { + int i; + if (n == 1) { + minmax_63b(&x[0], &x[step]); + } else { + merge_63b(n / 2, x, step * 2); + merge_63b(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax_63b(&x[i * step], &x[(i + 1) * step]); + } + } +} + +/* sort x[0],x[1],...,x[n-1] in place */ +/* requires n to be a power of 2 */ + +static void sort(int n, uint32_t *x) { + if (n <= 1) { + return; + } + sort(n / 2, x); + sort(n / 2, x + n / 2); + merge(n / 2, x, 1); +} + +void PQCLEAN_MCELIECE6688128F_AVX_sort_63b(int n, uint64_t *x) { + if (n <= 1) { + return; + } + PQCLEAN_MCELIECE6688128F_AVX_sort_63b(n / 2, x); + PQCLEAN_MCELIECE6688128F_AVX_sort_63b(n / 2, x + n / 2); + merge_63b(n / 2, x, 1); +} + +/* y[pi[i]] = x[i] */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void composeinv(int n, uint32_t *y, const uint32_t *x, const uint32_t *pi) { // NC + int i; + uint32_t t[2 * N]; + + for (i = 0; i < n; ++i) { + t[i] = x[i] | (pi[i] << 16); + } + + sort(n, t); + + for (i = 0; i < n; ++i) { + y[i] = t[i] & 0xFFFF; + } +} + +/* ip[i] = j iff pi[i] = j */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void invert(int n, uint32_t *ip, const uint32_t *pi) { + int i; + + for (i = 0; i < n; i++) { + ip[i] = i; + } + + composeinv(n, ip, ip, pi); +} + + +static void flow(int w, uint32_t *x, const uint32_t *y, int t) { + bit m0; + bit m1; + + uint32_t b; + uint32_t y_copy = *y; + + m0 = is_smaller(*y & ((1 << w) - 1), *x & ((1 << w) - 1)); + m1 = is_smaller(0, t); + + cswap(x, &y_copy, m0); + b = m0 & m1; + *x ^= b << w; +} + +/* input: permutation pi */ +/* output: (2w-1)n/2 (or 0 if n==1) control bits c[0],c[step],c[2*step],... 
*/ +/* requires n = 2^w */ +static void controlbitsfrompermutation(int w, int n, int step, int off, unsigned char *c, const uint32_t *pi) { + int i; + int j; + int k; + int t; + uint32_t ip[N] = {0}; + uint32_t I[2 * N] = {0}; + uint32_t P[2 * N] = {0}; + uint32_t PI[2 * N] = {0}; + uint32_t T[2 * N] = {0}; + uint32_t piflip[N] = {0}; + uint32_t subpi[2][N / 2] = {{0}}; + + if (w == 1) { + c[ off / 8 ] |= (pi[0] & 1) << (off % 8); + } + if (w <= 1) { + return; + } + + invert(n, ip, pi); + + for (i = 0; i < n; ++i) { + I[i] = ip[i] | (1 << w); + I[n + i] = pi[i]; + } + + for (i = 0; i < 2 * n; ++i) { + P[i] = (i >> w) + (i & ((1 << w) - 2)) + ((i & 1) << w); + } + + for (t = 0; t < w; ++t) { + composeinv(2 * n, PI, P, I); + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &PI[i], t); + } + + for (i = 0; i < 2 * n; ++i) { + T[i] = I[i ^ 1]; + } + + composeinv(2 * n, I, I, T); + + for (i = 0; i < 2 * n; ++i) { + T[i] = P[i ^ 1]; + } + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &T[i], 1); + } + } + + for (i = 0; i < n; ++i) { + for (j = 0; j < w; ++j) { + piflip[i] = pi[i]; + } + } + + for (i = 0; i < n / 2; ++i) { + c[ (off + i * step) / 8 ] |= ((P[i * 2] >> w) & 1) << ((off + i * step) % 8); + } + for (i = 0; i < n / 2; ++i) { + c[ (off + ((w - 1)*n + i) * step) / 8 ] |= ((P[n + i * 2] >> w) & 1) << ((off + ((w - 1) * n + i) * step) % 8); + } + + for (i = 0; i < n / 2; ++i) { + cswap(&piflip[i * 2], &piflip[i * 2 + 1], (P[n + i * 2] >> w) & 1); + } + + for (k = 0; k < 2; ++k) { + for (i = 0; i < n / 2; ++i) { + subpi[k][i] = piflip[i * 2 + k] >> 1; + } + } + + for (k = 0; k < 2; ++k) { + controlbitsfrompermutation(w - 1, n / 2, step * 2, off + step * (n / 2 + k), c, subpi[k]); + } +} + +/* input: pi, a permutation*/ +/* output: out, control bits w.r.t. pi */ +void PQCLEAN_MCELIECE6688128F_AVX_controlbits(unsigned char *out, const uint32_t *pi) { + unsigned int i; + unsigned char c[ (2 * GFBITS - 1) * (1 << GFBITS) / 16 ]; + + for (i = 0; i < sizeof(c); i++) { + c[i] = 0; + } + + controlbitsfrompermutation(GFBITS, (1 << GFBITS), 1, 0, c, pi); + + for (i = 0; i < sizeof(c); i++) { + out[i] = c[i]; + } +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/controlbits.h b/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/controlbits.h new file mode 100644 index 0000000000..90f8ea6e88 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/controlbits.h @@ -0,0 +1,15 @@ +#ifndef PQCLEAN_MCELIECE6688128F_AVX_CONTROLBITS_H +#define PQCLEAN_MCELIECE6688128F_AVX_CONTROLBITS_H +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. 
a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + + +#include + +void PQCLEAN_MCELIECE6688128F_AVX_sort_63b(int n, uint64_t *x); +void PQCLEAN_MCELIECE6688128F_AVX_controlbits(unsigned char *out, const uint32_t *pi); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/crypto_hash.h b/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/crypto_hash.h new file mode 100644 index 0000000000..b8783575ac --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/crypto_hash.h @@ -0,0 +1,7 @@ +#ifndef PQCLEAN_MCELIECE6688128F_AVX_CRYPTO_HASH_H +#define PQCLEAN_MCELIECE6688128F_AVX_CRYPTO_HASH_H +#include "fips202.h" + +#define crypto_hash_32b(out,in,inlen) shake256(out, 32, in, inlen) + +#endif diff --git a/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/decrypt.c b/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/decrypt.c new file mode 100644 index 0000000000..5e29bfc0f5 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/decrypt.c @@ -0,0 +1,234 @@ +/* + This file is for Niederreiter decryption +*/ + +#include "decrypt.h" + +#include "benes.h" +#include "bm.h" +#include "fft.h" +#include "fft_tr.h" +#include "params.h" +#include "util.h" + +#include + +static void scaling(vec256 out[][GFBITS], vec256 inv[][GFBITS], const unsigned char *sk, vec256 *recv) { + int i, j; + + vec128 sk_int[ GFBITS ]; + vec256 eval[32][ GFBITS ]; + vec256 tmp[ GFBITS ]; + + // computing inverses + + PQCLEAN_MCELIECE6688128F_AVX_irr_load(sk_int, sk); + + PQCLEAN_MCELIECE6688128F_AVX_fft(eval, sk_int); + + for (i = 0; i < 32; i++) { + PQCLEAN_MCELIECE6688128F_AVX_vec256_sq(eval[i], eval[i]); + } + + PQCLEAN_MCELIECE6688128F_AVX_vec256_copy(inv[0], eval[0]); + + for (i = 1; i < 32; i++) { + PQCLEAN_MCELIECE6688128F_AVX_vec256_mul(inv[i], inv[i - 1], eval[i]); + } + + PQCLEAN_MCELIECE6688128F_AVX_vec256_inv(tmp, inv[31]); + + for (i = 30; i >= 0; i--) { + PQCLEAN_MCELIECE6688128F_AVX_vec256_mul(inv[i + 1], tmp, inv[i]); + PQCLEAN_MCELIECE6688128F_AVX_vec256_mul(tmp, tmp, eval[i + 1]); + } + + PQCLEAN_MCELIECE6688128F_AVX_vec256_copy(inv[0], tmp); + + // + + for (i = 0; i < 32; i++) { + for (j = 0; j < GFBITS; j++) { + out[i][j] = PQCLEAN_MCELIECE6688128F_AVX_vec256_and(inv[i][j], recv[i]); + } + } +} + +static void preprocess(vec128 *recv, const unsigned char *s) { + int i; + unsigned char r[ 1024 ]; + + for (i = 0; i < SYND_BYTES; i++) { + r[i] = s[i]; + } + + for (i = SYND_BYTES; i < 1024; i++) { + r[i] = 0; + } + + for (i = 0; i < 64; i++) { + recv[i] = PQCLEAN_MCELIECE6688128F_AVX_load16(r + i * 16); + } +} + +static void postprocess(unsigned char *e, vec128 *err) { + int i; + unsigned char error8[ (1 << GFBITS) / 8 ]; + uint64_t v[2]; + + for (i = 0; i < 64; i++) { + v[0] = PQCLEAN_MCELIECE6688128F_AVX_vec128_extract(err[i], 0); + v[1] = PQCLEAN_MCELIECE6688128F_AVX_vec128_extract(err[i], 1); + + PQCLEAN_MCELIECE6688128F_AVX_store8(error8 + i * 16 + 0, v[0]); + PQCLEAN_MCELIECE6688128F_AVX_store8(error8 + i * 16 + 8, v[1]); + } + + for (i = 0; i < SYS_N / 8; i++) { + e[i] = error8[i]; + } +} + +static void scaling_inv(vec256 out[][GFBITS], vec256 inv[][GFBITS], vec256 *recv) { + int i, j; + + for (i = 0; i < 32; i++) { + for (j = 0; j < GFBITS; j++) { + out[i][j] = PQCLEAN_MCELIECE6688128F_AVX_vec256_and(inv[i][j], recv[i]); + } + } +} + +static uint16_t weight_check(unsigned char *e, vec128 *error) { + int i; + uint16_t w0 = 0; + uint16_t w1 = 0; + uint16_t check; + 
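+ /* Note on the weight check below: w0 recounts the error weight from the
+    vec128 representation and w1 from the packed byte array e; the final
+    bit manipulation is a branch-free comparison, so check becomes 1 exactly
+    when w0 == SYS_T and w1 == SYS_T, and 0 otherwise, with no
+    secret-dependent branch. */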
+ for (i = 0; i < 64; i++) { + w0 += _mm_popcnt_u64( PQCLEAN_MCELIECE6688128F_AVX_vec128_extract(error[i], 0) ); + w0 += _mm_popcnt_u64( PQCLEAN_MCELIECE6688128F_AVX_vec128_extract(error[i], 1) ); + } + + for (i = 0; i < SYS_N / 8; i++) { + w1 += _mm_popcnt_u64( e[i] ); + } + + check = (w0 ^ SYS_T) | (w1 ^ SYS_T); + check -= 1; + check >>= 15; + + return check; +} + +static uint16_t synd_cmp(vec256 *s0, vec256 *s1) { + int i; + vec256 diff; + + diff = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(s0[0], s1[0]); + + for (i = 1; i < GFBITS; i++) { + diff = PQCLEAN_MCELIECE6688128F_AVX_vec256_or(diff, PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(s0[i], s1[i])); + } + + return (uint16_t)PQCLEAN_MCELIECE6688128F_AVX_vec256_testz(diff); +} + +static void reformat_128to256(vec256 *out, vec128 *in) { + int i; + uint64_t v[4]; + + for (i = 0; i < 32; i++) { + v[0] = PQCLEAN_MCELIECE6688128F_AVX_vec128_extract(in[2 * i + 0], 0); + v[1] = PQCLEAN_MCELIECE6688128F_AVX_vec128_extract(in[2 * i + 0], 1); + v[2] = PQCLEAN_MCELIECE6688128F_AVX_vec128_extract(in[2 * i + 1], 0); + v[3] = PQCLEAN_MCELIECE6688128F_AVX_vec128_extract(in[2 * i + 1], 1); + + out[i] = PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(v[0], v[1], v[2], v[3]); + } +} + +static void reformat_256to128(vec128 *out, vec256 *in) { + int i; + uint64_t v[4]; + + for (i = 0; i < 32; i++) { + v[0] = PQCLEAN_MCELIECE6688128F_AVX_vec256_extract(in[i], 0); + v[1] = PQCLEAN_MCELIECE6688128F_AVX_vec256_extract(in[i], 1); + v[2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_extract(in[i], 2); + v[3] = PQCLEAN_MCELIECE6688128F_AVX_vec256_extract(in[i], 3); + + out[2 * i + 0] = PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(v[0], v[1]); + out[2 * i + 1] = PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(v[2], v[3]); + } +} + +/* Niederreiter decryption with the Berlekamp decoder */ +/* intput: sk, secret key */ +/* c, ciphertext (syndrome) */ +/* output: e, error vector */ +/* return: 0 for success; 1 for failure */ +int PQCLEAN_MCELIECE6688128F_AVX_decrypt(unsigned char *e, const unsigned char *sk, const unsigned char *c) { + int i; + + uint16_t check_synd; + uint16_t check_weight; + + vec256 inv[ 32 ][ GFBITS ]; + vec256 scaled[ 32 ][ GFBITS ]; + vec256 eval[32][ GFBITS ]; + + vec128 error128[ 64 ]; + vec256 error256[ 32 ]; + + vec256 s_priv[ GFBITS ]; + vec256 s_priv_cmp[ GFBITS ]; + vec128 locator[ GFBITS ]; + + vec128 recv128[ 64 ]; + vec256 recv256[ 32 ]; + vec256 allone; + + vec128 bits_int[25][32]; + + // Berlekamp decoder + + preprocess(recv128, c); + + PQCLEAN_MCELIECE6688128F_AVX_load_bits(bits_int, sk + IRR_BYTES); + PQCLEAN_MCELIECE6688128F_AVX_benes(recv128, bits_int, 1); + + reformat_128to256(recv256, recv128); + + scaling(scaled, inv, sk, recv256); + PQCLEAN_MCELIECE6688128F_AVX_fft_tr(s_priv, scaled); + PQCLEAN_MCELIECE6688128F_AVX_bm(locator, s_priv); + + PQCLEAN_MCELIECE6688128F_AVX_fft(eval, locator); + + // reencryption and weight check + + allone = PQCLEAN_MCELIECE6688128F_AVX_vec256_set1_16b(0xFFFF); + + for (i = 0; i < 32; i++) { + error256[i] = PQCLEAN_MCELIECE6688128F_AVX_vec256_or_reduce(eval[i]); + error256[i] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(error256[i], allone); + } + + scaling_inv(scaled, inv, error256); + PQCLEAN_MCELIECE6688128F_AVX_fft_tr(s_priv_cmp, scaled); + + check_synd = synd_cmp(s_priv, s_priv_cmp); + + // + + reformat_256to128(error128, error256); + PQCLEAN_MCELIECE6688128F_AVX_benes(error128, bits_int, 0); + + postprocess(e, error128); + + check_weight = weight_check(e, error128); + + return 1 - (check_synd & check_weight); +} + diff 
--git a/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/decrypt.h b/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/decrypt.h new file mode 100644 index 0000000000..229dce4b15 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/decrypt.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_MCELIECE6688128F_AVX_DECRYPT_H +#define PQCLEAN_MCELIECE6688128F_AVX_DECRYPT_H +/* + This file is for Nieddereiter decryption +*/ + +int PQCLEAN_MCELIECE6688128F_AVX_decrypt(unsigned char * /*e*/, const unsigned char * /*sk*/, const unsigned char * /*c*/); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/encrypt.c b/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/encrypt.c new file mode 100644 index 0000000000..3859648ebd --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/encrypt.c @@ -0,0 +1,104 @@ +/* + This file is for Niederreiter encryption +*/ + +#include "encrypt.h" + +#include "int32_sort.h" +#include "params.h" +#include "randombytes.h" +#include "util.h" + +#include +#include +#include +#include + +#include "gf.h" + +/* input: public key pk, error vector e */ +/* output: syndrome s */ +extern void PQCLEAN_MCELIECE6688128F_AVX_syndrome_asm(unsigned char *s, const unsigned char *pk, unsigned char *e); + +/* output: e, an error vector of weight t */ +static void gen_e(unsigned char *e) { + int i, j, eq, count; + + uint16_t ind[ SYS_T * 2 ]; + int32_t ind32[ SYS_T * 2 ]; + uint64_t e_int[ (SYS_N + 63) / 64 ]; + uint64_t one = 1; + uint64_t mask; + uint64_t val[ SYS_T ]; + + while (1) { + randombytes((unsigned char *) ind, sizeof(ind)); + + for (i = 0; i < SYS_T * 2; i++) { + ind[i] &= GFMASK; + } + + // moving and counting indices in the correct range + + count = 0; + for (i = 0; i < SYS_T * 2; i++) { + if (ind[i] < SYS_N) { + ind32[ count++ ] = ind[i]; + } + } + + if (count < SYS_T) { + continue; + } + + // check for repetition + + PQCLEAN_MCELIECE6688128F_AVX_int32_sort(ind32, SYS_T); + + eq = 0; + for (i = 1; i < SYS_T; i++) { + if (ind32[i - 1] == ind32[i]) { + eq = 1; + } + } + + if (eq == 0) { + break; + } + } + + for (j = 0; j < SYS_T; j++) { + val[j] = one << (ind32[j] & 63); + } + + for (i = 0; i < (SYS_N + 63) / 64; i++) { + e_int[i] = 0; + + for (j = 0; j < SYS_T; j++) { + mask = i ^ (ind32[j] >> 6); + mask -= 1; + mask >>= 63; + mask = -mask; + + e_int[i] |= val[j] & mask; + } + } + + for (i = 0; i < (SYS_N + 63) / 64 - 1; i++) { + PQCLEAN_MCELIECE6688128F_AVX_store8(e, e_int[i]); + e += 8; + } + + for (j = 0; j < (SYS_N % 64); j += 8) { + e[ j / 8 ] = (e_int[i] >> j) & 0xFF; + } +} + +/* input: public key pk */ +/* output: error vector e, syndrome s */ +void PQCLEAN_MCELIECE6688128F_AVX_encrypt(unsigned char *s, unsigned char *e, const unsigned char *pk) { + gen_e(e); + + PQCLEAN_MCELIECE6688128F_AVX_syndrome_asm(s, pk, e); +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/encrypt.h b/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/encrypt.h new file mode 100644 index 0000000000..ccc75600b2 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/encrypt.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_MCELIECE6688128F_AVX_ENCRYPT_H +#define PQCLEAN_MCELIECE6688128F_AVX_ENCRYPT_H +/* + This file is for Niederreiter encryption +*/ + + +void PQCLEAN_MCELIECE6688128F_AVX_encrypt(unsigned char * /*s*/, unsigned char * /*e*/, const unsigned char * /*pk*/); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/fft.c 
b/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/fft.c new file mode 100644 index 0000000000..344f9dd08a --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/fft.c @@ -0,0 +1,275 @@ +/* + This file is for the Gao-Mateer FFT + sse http://www.math.clemson.edu/~sgao/papers/GM10.pdf +*/ + +#include "fft.h" + +#include "transpose.h" +#include "vec256.h" + +#include + +/* input: in, polynomial in bitsliced form */ +/* output: in, result of applying the radix conversions on in */ +static void radix_conversions(vec128 *in) { + int i, j, k; + vec128 t; + uint64_t v0, v1; + + const vec128 mask[5][2] = { + { + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0x8888888888888888, 0x8888888888888888), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0x4444444444444444, 0x4444444444444444) + }, + { + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0xC0C0C0C0C0C0C0C0, 0xC0C0C0C0C0C0C0C0), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0x3030303030303030, 0x3030303030303030) + }, + { + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0xF000F000F000F000, 0xF000F000F000F000), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0x0F000F000F000F00, 0x0F000F000F000F00) + }, + { + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0xFF000000FF000000, 0xFF000000FF000000), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0x00FF000000FF0000, 0x00FF000000FF0000) + }, + { + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0xFFFF000000000000, 0xFFFF000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0x0000FFFF00000000, 0x0000FFFF00000000) + } + }; + + const vec128 s[5][GFBITS] = { +#include "scalars_2x.inc" + }; + + // + + for (j = 0; j <= 5; j++) { + for (i = 0; i < GFBITS; i++) { + v1 = PQCLEAN_MCELIECE6688128F_AVX_vec128_extract(in[i], 1); + v1 ^= v1 >> 32; + v0 = PQCLEAN_MCELIECE6688128F_AVX_vec128_extract(in[i], 0); + v0 ^= v1 << 32; + in[i] = PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(v0, v1); + } + + for (i = 0; i < GFBITS; i++) { + for (k = 4; k >= j; k--) { + t = PQCLEAN_MCELIECE6688128F_AVX_vec128_and(in[i], mask[k][0]); + t = PQCLEAN_MCELIECE6688128F_AVX_vec128_srl_2x(t, 1 << k); + in[i] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(in[i], t); + + t = PQCLEAN_MCELIECE6688128F_AVX_vec128_and(in[i], mask[k][1]); + t = PQCLEAN_MCELIECE6688128F_AVX_vec128_srl_2x(t, 1 << k); + in[i] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(in[i], t); + } + } + + if (j < 5) { + PQCLEAN_MCELIECE6688128F_AVX_vec128_mul(in, in, s[j]); // scaling + } + } +} + +/* input: in, result of applying the radix conversions to the input polynomial */ +/* output: out, evaluation results (by applying the FFT butterflies) */ +static void butterflies(vec256 out[][ GFBITS ], vec128 *in) { + int i, j, k, s, b; + + vec128 tmp[ GFBITS ]; + vec256 tmp0[ GFBITS ]; + vec256 tmp1[ GFBITS ]; + vec128 t[ GFBITS ]; + + union { + vec128 v[8][ GFBITS + 1 ]; + vec256 V[8][ (GFBITS + 1) / 2 ]; + } pre; + + union { + vec128 v[64][ 2 ]; + vec256 V[64]; + } buf; + + uint64_t v0, v1; + + const vec256 consts[ 33 ][ GFBITS ] = { +#include "consts.inc" + }; + + const vec256 powers[ 32 ][ GFBITS ] = { +#include "powers.inc" + }; + + uint64_t consts_ptr = 2; + + const unsigned char reversal[64] = { + 0, 32, 16, 48, 8, 40, 24, 56, + 4, 36, 20, 52, 12, 44, 28, 60, + 2, 34, 18, 50, 10, 42, 26, 58, + 6, 38, 22, 54, 14, 46, 30, 62, + 1, 33, 17, 49, 9, 41, 25, 57, + 5, 37, 21, 53, 13, 45, 29, 61, + 3, 35, 19, 51, 11, 43, 27, 59, + 7, 39, 23, 55, 15, 47, 31, 63 + }; + + const uint16_t beta[8] = {2522, 7827, 7801, 8035, 6897, 8167, 3476, 0}; + + // boradcast + + for (j = 0; j < GFBITS; j++) { + t[j] = 
PQCLEAN_MCELIECE6688128F_AVX_vec128_unpack_high(in[j], in[j]); + } + + for (i = 0; i < 8; i += 2) { + for (j = 0; j < GFBITS; j++) { + v0 = (beta[i + 0] >> j) & 1; + v0 = -v0; + v1 = (beta[i + 1] >> j) & 1; + v1 = -v1; + + tmp[j] = PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(v0, v1); + } + + PQCLEAN_MCELIECE6688128F_AVX_vec128_mul(tmp, t, tmp); + + for (j = 0; j < GFBITS; j++) { + pre.v[i + 0][j] = PQCLEAN_MCELIECE6688128F_AVX_vec128_unpack_low(tmp[j], tmp[j]); + pre.v[i + 1][j] = PQCLEAN_MCELIECE6688128F_AVX_vec128_unpack_high(tmp[j], tmp[j]); + } + } + + for (i = 0; i < GFBITS; i += 2) { + if (i != GFBITS - 1) { + buf.v[0][1] = PQCLEAN_MCELIECE6688128F_AVX_vec128_unpack_low(in[i + 1], in[i + 1] ^ pre.v[6][i + 1]); + } + buf.v[0][0] = PQCLEAN_MCELIECE6688128F_AVX_vec128_unpack_low(in[i + 0], in[i + 0] ^ pre.v[6][i + 0]); + + buf.V[1] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[0], pre.V[0][i / 2]); + buf.V[16] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[0], pre.V[4][i / 2]); + buf.V[3] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[1], pre.V[1][i / 2]); + buf.V[48] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[16], pre.V[5][i / 2]); + buf.V[49] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[48], pre.V[0][i / 2]); + buf.V[2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[0], pre.V[1][i / 2]); + buf.V[51] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[49], pre.V[1][i / 2]); + buf.V[6] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[2], pre.V[2][i / 2]); + buf.V[50] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[51], pre.V[0][i / 2]); + buf.V[7] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[6], pre.V[0][i / 2]); + buf.V[54] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[50], pre.V[2][i / 2]); + buf.V[5] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[7], pre.V[1][i / 2]); + buf.V[55] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[54], pre.V[0][i / 2]); + buf.V[53] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[55], pre.V[1][i / 2]); + buf.V[4] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[0], pre.V[2][i / 2]); + buf.V[52] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[53], pre.V[0][i / 2]); + buf.V[12] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[4], pre.V[3][i / 2]); + buf.V[60] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[52], pre.V[3][i / 2]); + buf.V[13] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[12], pre.V[0][i / 2]); + buf.V[61] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[60], pre.V[0][i / 2]); + buf.V[15] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[13], pre.V[1][i / 2]); + buf.V[63] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[61], pre.V[1][i / 2]); + buf.V[14] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[15], pre.V[0][i / 2]); + buf.V[62] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[63], pre.V[0][i / 2]); + buf.V[10] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[14], pre.V[2][i / 2]); + buf.V[58] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[62], pre.V[2][i / 2]); + buf.V[11] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[10], pre.V[0][i / 2]); + buf.V[59] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[58], pre.V[0][i / 2]); + buf.V[9] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[11], pre.V[1][i / 2]); + buf.V[57] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[59], pre.V[1][i / 2]); + buf.V[56] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[57], pre.V[0][i / 2]); + buf.V[8] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[0], pre.V[3][i / 2]); + buf.V[40] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[56], pre.V[4][i / 2]); 
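+ /* The unrolled xor chain above and below fills the remaining entries of
+    buf.V starting from buf.V[0]: each buf.V[j] is obtained from an already
+    computed buf.V[j ^ (1 << m)] with a single vector xor of pre.V[m][i / 2],
+    i.e. the masked combinations are enumerated in a Gray-code-like order so
+    every new entry costs one vector operation. */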
+ buf.V[24] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[8], pre.V[4][i / 2]); + buf.V[41] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[40], pre.V[0][i / 2]); + buf.V[25] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[24], pre.V[0][i / 2]); + buf.V[43] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[41], pre.V[1][i / 2]); + buf.V[27] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[25], pre.V[1][i / 2]); + buf.V[42] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[43], pre.V[0][i / 2]); + buf.V[26] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[27], pre.V[0][i / 2]); + buf.V[46] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[42], pre.V[2][i / 2]); + buf.V[30] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[26], pre.V[2][i / 2]); + buf.V[47] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[46], pre.V[0][i / 2]); + buf.V[31] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[30], pre.V[0][i / 2]); + buf.V[45] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[47], pre.V[1][i / 2]); + buf.V[29] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[31], pre.V[1][i / 2]); + buf.V[44] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[45], pre.V[0][i / 2]); + buf.V[28] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[29], pre.V[0][i / 2]); + buf.V[36] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[44], pre.V[3][i / 2]); + buf.V[20] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[28], pre.V[3][i / 2]); + buf.V[37] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[36], pre.V[0][i / 2]); + buf.V[21] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[20], pre.V[0][i / 2]); + buf.V[39] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[37], pre.V[1][i / 2]); + buf.V[23] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[21], pre.V[1][i / 2]); + buf.V[38] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[39], pre.V[0][i / 2]); + buf.V[22] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[23], pre.V[0][i / 2]); + buf.V[34] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[38], pre.V[2][i / 2]); + buf.V[18] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[22], pre.V[2][i / 2]); + buf.V[35] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[34], pre.V[0][i / 2]); + buf.V[19] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[18], pre.V[0][i / 2]); + buf.V[33] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[35], pre.V[1][i / 2]); + buf.V[17] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[19], pre.V[1][i / 2]); + buf.V[32] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[33], pre.V[0][i / 2]); + + + // transpose + + PQCLEAN_MCELIECE6688128F_AVX_transpose_64x256_sp(buf.V); + + for (j = 0; j < 32; j++) { + if (i != GFBITS - 1) { + out[j][i + 1] = PQCLEAN_MCELIECE6688128F_AVX_vec256_unpack_high(buf.V[ reversal[2 * j + 0] ], buf.V[ reversal[2 * j + 1] ]); + } + out[j][i + 0] = PQCLEAN_MCELIECE6688128F_AVX_vec256_unpack_low (buf.V[ reversal[2 * j + 0] ], buf.V[ reversal[2 * j + 1] ]); + } + } + + // butterflies + + for (k = 0; k < 32; k += 2) { + for (b = 0; b < GFBITS; b++) { + tmp0[b] = PQCLEAN_MCELIECE6688128F_AVX_vec256_unpack_low (out[k][b], out[k + 1][b]); + } + for (b = 0; b < GFBITS; b++) { + tmp1[b] = PQCLEAN_MCELIECE6688128F_AVX_vec256_unpack_high (out[k][b], out[k + 1][b]); + } + + PQCLEAN_MCELIECE6688128F_AVX_vec256_maa_asm(tmp0, tmp1, consts[1]); + + for (b = 0; b < GFBITS; b++) { + out[k][b] = PQCLEAN_MCELIECE6688128F_AVX_vec256_unpack_low (tmp0[b], tmp1[b]); + } + for (b = 0; b < GFBITS; b++) { + out[k + 1][b] = PQCLEAN_MCELIECE6688128F_AVX_vec256_unpack_high (tmp0[b], tmp1[b]); + } + } + + for (i = 0; i <= 4; i++) { + s = 1 << 
i; + + for (j = 0; j < 32; j += 2 * s) { + for (k = j; k < j + s; k++) { + PQCLEAN_MCELIECE6688128F_AVX_vec256_maa_asm(out[k], out[k + s], consts[ consts_ptr + (k - j) ]); + } + } + + consts_ptr += (1 << i); + } + + // adding the part contributed by x^128 + + for (i = 0; i < 32; i++) { + for (b = 0; b < GFBITS; b++) { + out[i][b] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(out[i][b], powers[i][b]); + } + } +} + +/* input: in, polynomial in bitsliced form */ +/* output: out, bitsliced results of evaluating in all the field elements */ +void PQCLEAN_MCELIECE6688128F_AVX_fft(vec256 out[][GFBITS], vec128 *in) { + radix_conversions(in); + butterflies(out, in); +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/fft.h b/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/fft.h new file mode 100644 index 0000000000..5ecdf44abb --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/fft.h @@ -0,0 +1,17 @@ +#ifndef PQCLEAN_MCELIECE6688128F_AVX_FFT_H +#define PQCLEAN_MCELIECE6688128F_AVX_FFT_H +/* + This file is for the Gao-Mateer FFT + sse http://www.math.clemson.edu/~sgao/papers/GM10.pdf +*/ + + +#include "params.h" +#include "vec128.h" +#include "vec256.h" +#include + +void PQCLEAN_MCELIECE6688128F_AVX_fft(vec256 /*out*/[][GFBITS], vec128 * /*in*/); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/fft_tr.c b/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/fft_tr.c new file mode 100644 index 0000000000..504d6a010f --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/fft_tr.c @@ -0,0 +1,379 @@ +/* + This file is for transpose of the Gao-Mateer FFT + Functions with names ending with _tr are (roughly) the transpose of the corresponding functions in fft.c +*/ + +#include "fft_tr.h" + +#include "transpose.h" + +#include + +static void radix_conversions_tr(vec256 *in) { + int i, j, k; + vec256 t; + uint64_t v[4]; + + const vec256 mask[6][2] = { + { + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0x2222222222222222, 0x2222222222222222, 0x2222222222222222, 0x2222222222222222), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0x4444444444444444, 0x4444444444444444, 0x4444444444444444, 0x4444444444444444) + }, + { + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0x0C0C0C0C0C0C0C0C, 0x0C0C0C0C0C0C0C0C, 0x0C0C0C0C0C0C0C0C, 0x0C0C0C0C0C0C0C0C), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0x3030303030303030, 0x3030303030303030, 0x3030303030303030, 0x3030303030303030) + }, + { + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0x00F000F000F000F0, 0x00F000F000F000F0, 0x00F000F000F000F0, 0x00F000F000F000F0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0x0F000F000F000F00, 0x0F000F000F000F00, 0x0F000F000F000F00, 0x0F000F000F000F00) + }, + { + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0x0000FF000000FF00, 0x0000FF000000FF00, 0x0000FF000000FF00, 0x0000FF000000FF00), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0x00FF000000FF0000, 0x00FF000000FF0000, 0x00FF000000FF0000, 0x00FF000000FF0000) + }, + { + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0x00000000FFFF0000, 0x00000000FFFF0000, 0x00000000FFFF0000, 0x00000000FFFF0000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0x0000FFFF00000000, 0x0000FFFF00000000, 0x0000FFFF00000000, 0x0000FFFF00000000) + }, + { + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0x00000000FFFFFFFF, 0x00000000FFFFFFFF, 0x00000000FFFFFFFF, 0x00000000FFFFFFFF) + } + }; + + const vec256 s[6][GFBITS] = { +#include 
"scalars_4x.inc" + }; + + // + + for (j = 6; j >= 0; j--) { + if (j < 6) { + PQCLEAN_MCELIECE6688128F_AVX_vec256_mul(in, in, s[j]); // scaling + } + + for (k = j; k <= 4; k++) { + for (i = 0; i < GFBITS; i++) { + t = PQCLEAN_MCELIECE6688128F_AVX_vec256_and(in[i], mask[k][0]); + t = PQCLEAN_MCELIECE6688128F_AVX_vec256_sll_4x(t, 1 << k); + in[i] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(in[i], t); + + t = PQCLEAN_MCELIECE6688128F_AVX_vec256_and(in[i], mask[k][1]); + t = PQCLEAN_MCELIECE6688128F_AVX_vec256_sll_4x(t, 1 << k); + in[i] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(in[i], t); + } + } + + if (j <= 5) { + for (i = 0; i < GFBITS; i++) { + v[0] = PQCLEAN_MCELIECE6688128F_AVX_vec256_extract(in[i], 0); + v[1] = PQCLEAN_MCELIECE6688128F_AVX_vec256_extract(in[i], 1); + v[2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_extract(in[i], 2); + v[3] = PQCLEAN_MCELIECE6688128F_AVX_vec256_extract(in[i], 3); + + v[1] ^= v[0] >> 32; + v[1] ^= v[1] << 32; + v[3] ^= v[2] >> 32; + v[3] ^= v[3] << 32; + + in[i] = PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(v[0], v[1], v[2], v[3]); + } + } + + for (i = 0; i < GFBITS; i++) { + v[0] = PQCLEAN_MCELIECE6688128F_AVX_vec256_extract(in[i], 0); + v[1] = PQCLEAN_MCELIECE6688128F_AVX_vec256_extract(in[i], 1); + v[2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_extract(in[i], 2); + v[3] = PQCLEAN_MCELIECE6688128F_AVX_vec256_extract(in[i], 3); + + v[2] ^= v[1]; + v[3] ^= v[2]; + + in[i] = PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(v[0], v[1], v[2], v[3]); + } + + } +} + +static void butterflies_tr(vec256 *out, vec256 in[][ GFBITS ]) { + int i, j, k, s, b; + + vec256 t0[ GFBITS ]; + vec256 t1[ GFBITS ]; + vec256 t; + + vec128 out128[ GFBITS ][ 2 ]; + vec128 tmp[ GFBITS ]; + + union { + vec128 v[6][ GFBITS + 1 ]; + vec256 V[6][ (GFBITS + 1) / 2 ]; + } pre; + + union { + vec128 v[64][ 2 ]; + vec256 V[64]; + } buf; + + const vec256 consts[ 33 ][ GFBITS ] = { +#include "consts.inc" + }; + + uint64_t v[4]; + uint64_t consts_ptr = 33; + + const unsigned char reversal[] = { + 0, 32, 16, 48, 8, 40, 24, 56, + 4, 36, 20, 52, 12, 44, 28, 60, + 2, 34, 18, 50, 10, 42, 26, 58, + 6, 38, 22, 54, 14, 46, 30, 62, + 1, 33, 17, 49, 9, 41, 25, 57, + 5, 37, 21, 53, 13, 45, 29, 61, + 3, 35, 19, 51, 11, 43, 27, 59, + 7, 39, 23, 55, 15, 47, 31, 63 + }; + + const uint16_t beta[6] = {5246, 5306, 6039, 6685, 4905, 6755}; + + // butterflies + + for (i = 4; i >= 0; i--) { + s = 1 << i; + consts_ptr -= s; + + for (j = 0; j < 32; j += 2 * s) { + for (k = j; k < j + s; k++) { + PQCLEAN_MCELIECE6688128F_AVX_vec256_ama_asm(in[k], in[k + s], consts[ consts_ptr + (k - j) ]); + } + } + + } + + for (k = 0; k < 32; k += 2) { + for (b = 0; b < GFBITS; b++) { + t0[b] = PQCLEAN_MCELIECE6688128F_AVX_vec256_unpack_low(in[k][b], in[k + 1][b]); + } + for (b = 0; b < GFBITS; b++) { + t1[b] = PQCLEAN_MCELIECE6688128F_AVX_vec256_unpack_high(in[k][b], in[k + 1][b]); + } + + PQCLEAN_MCELIECE6688128F_AVX_vec256_ama_asm(t0, t1, consts[1]); + + for (b = 0; b < GFBITS; b++) { + in[k][b] = PQCLEAN_MCELIECE6688128F_AVX_vec256_unpack_low(t0[b], t1[b]); + } + for (b = 0; b < GFBITS; b++) { + in[k + 1][b] = PQCLEAN_MCELIECE6688128F_AVX_vec256_unpack_high(t0[b], t1[b]); + } + + for (b = 0; b < GFBITS; b++) { + t0[b] = PQCLEAN_MCELIECE6688128F_AVX_vec256_unpack_low_2x(in[k][b], in[k + 1][b]); + } + for (b = 0; b < GFBITS; b++) { + t1[b] = PQCLEAN_MCELIECE6688128F_AVX_vec256_unpack_high_2x(in[k][b], in[k + 1][b]); + } + + PQCLEAN_MCELIECE6688128F_AVX_vec256_ama_asm(t0, t1, consts[0]); + + for (b = 0; b < GFBITS; b++) { + in[k + 0][b] = 
PQCLEAN_MCELIECE6688128F_AVX_vec256_unpack_low_2x(t0[b], t1[b]); + } + for (b = 0; b < GFBITS; b++) { + in[k + 1][b] = PQCLEAN_MCELIECE6688128F_AVX_vec256_unpack_high_2x(t0[b], t1[b]); + } + } + + + // boradcast + + for (i = 0; i < GFBITS; i += 2) { + // transpose + + for (k = 0; k < 32; k++) { + if (i != GFBITS - 1) { + buf.v[ reversal[2 * k + 0] ][1] = PQCLEAN_MCELIECE6688128F_AVX_vec256_extract2x(in[ k ][i + 1], 0); + buf.v[ reversal[2 * k + 1] ][1] = PQCLEAN_MCELIECE6688128F_AVX_vec256_extract2x(in[ k ][i + 1], 1); + } + + buf.v[ reversal[2 * k + 0] ][0] = PQCLEAN_MCELIECE6688128F_AVX_vec256_extract2x(in[ k ][i + 0], 0); + buf.v[ reversal[2 * k + 1] ][0] = PQCLEAN_MCELIECE6688128F_AVX_vec256_extract2x(in[ k ][i + 0], 1); + } + + PQCLEAN_MCELIECE6688128F_AVX_transpose_64x256_sp(buf.V); + + // + + pre.V[0][i / 2] = buf.V[32]; + buf.V[33] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[33], buf.V[32]); + pre.V[1][i / 2] = buf.V[33]; + buf.V[35] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[35], buf.V[33]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[35]); + buf.V[34] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[34], buf.V[35]); + pre.V[2][i / 2] = buf.V[34]; + buf.V[38] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[38], buf.V[34]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[38]); + buf.V[39] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[39], buf.V[38]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[39]); + buf.V[37] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[37], buf.V[39]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[37]); + buf.V[36] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[36], buf.V[37]); + pre.V[3][i / 2] = buf.V[36]; + buf.V[44] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[44], buf.V[36]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[44]); + buf.V[45] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[45], buf.V[44]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[45]); + buf.V[47] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[47], buf.V[45]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[47]); + buf.V[46] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[46], buf.V[47]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[2][i / 2], buf.V[46]); + buf.V[42] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[42], buf.V[46]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[42]); + buf.V[43] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[43], buf.V[42]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[43]); + buf.V[41] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[41], buf.V[43]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[41]); + buf.V[40] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[40], buf.V[41]); + pre.V[4][i / 2] = buf.V[40]; + buf.V[56] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[56], buf.V[40]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[56]); + buf.V[57] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[57], buf.V[56]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[57]); + buf.V[59] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[59], buf.V[57]); + pre.V[0][i / 2] = 
PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[59]); + buf.V[58] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[58], buf.V[59]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[2][i / 2], buf.V[58]); + buf.V[62] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[62], buf.V[58]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[62]); + buf.V[63] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[63], buf.V[62]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[63]); + buf.V[61] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[61], buf.V[63]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[61]); + buf.V[60] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[60], buf.V[61]); + pre.V[3][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[3][i / 2], buf.V[60]); + buf.V[52] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[52], buf.V[60]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[52]); + buf.V[53] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[53], buf.V[52]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[53]); + buf.V[55] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[55], buf.V[53]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[55]); + buf.V[54] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[54], buf.V[55]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[2][i / 2], buf.V[54]); + buf.V[50] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[50], buf.V[54]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[50]); + buf.V[51] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[51], buf.V[50]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[51]); + buf.V[49] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[49], buf.V[51]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[49]); + buf.V[48] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[48], buf.V[49]); + pre.V[5][i / 2] = buf.V[48]; + buf.V[16] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[16], buf.V[48]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[16]); + buf.V[17] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[17], buf.V[16]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[17]); + buf.V[19] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[19], buf.V[17]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[19]); + buf.V[18] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[18], buf.V[19]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[2][i / 2], buf.V[18]); + buf.V[22] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[22], buf.V[18]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[22]); + buf.V[23] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[23], buf.V[22]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[23]); + buf.V[21] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[21], buf.V[23]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[21]); + buf.V[20] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[20], buf.V[21]); + pre.V[3][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[3][i / 2], buf.V[20]); + buf.V[28] = 
PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[28], buf.V[20]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[28]); + buf.V[29] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[29], buf.V[28]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[29]); + buf.V[31] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[31], buf.V[29]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[31]); + buf.V[30] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[30], buf.V[31]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[2][i / 2], buf.V[30]); + buf.V[26] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[26], buf.V[30]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[26]); + buf.V[27] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[27], buf.V[26]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[27]); + buf.V[25] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[25], buf.V[27]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[25]); + buf.V[24] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[24], buf.V[25]); + pre.V[4][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[4][i / 2], buf.V[24]); + buf.V[8] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[8], buf.V[24]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[8]); + buf.V[9] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[9], buf.V[8]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[9]); + buf.V[11] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[11], buf.V[9]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[11]); + buf.V[10] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[10], buf.V[11]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[2][i / 2], buf.V[10]); + buf.V[14] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[14], buf.V[10]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[14]); + buf.V[15] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[15], buf.V[14]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[15]); + buf.V[13] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[13], buf.V[15]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[13]); + buf.V[12] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[12], buf.V[13]); + pre.V[3][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[3][i / 2], buf.V[12]); + buf.V[4] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[4], buf.V[12]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[4]); + buf.V[5] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[5], buf.V[4]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[5]); + buf.V[7] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[7], buf.V[5]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[7]); + buf.V[6] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[6], buf.V[7]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[2][i / 2], buf.V[6]); + buf.V[2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[2], buf.V[6]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[2]); + buf.V[3] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[3], buf.V[2]); + pre.V[1][i / 2] = 
PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[3]); + buf.V[1] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[1], buf.V[3]); + + pre.V[0][i / 2] = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[1]); + t = PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(buf.V[0], buf.V[1]); + + if (i != GFBITS - 1) { + out128[i + 1][0] = PQCLEAN_MCELIECE6688128F_AVX_vec256_extract2x(t, 1); + } + out128[i + 0][0] = PQCLEAN_MCELIECE6688128F_AVX_vec256_extract2x(t, 0); + + } + + // + + for (j = 0; j < GFBITS; j++) { + tmp[j] = PQCLEAN_MCELIECE6688128F_AVX_vec128_setbits((beta[0] >> j) & 1); + } + + PQCLEAN_MCELIECE6688128F_AVX_vec128_mul(tmp, pre.v[0], tmp); + + for (b = 0; b < GFBITS; b++) { + out128[b][1] = tmp[b]; + } + + for (i = 1; i < 6; i++) { + for (j = 0; j < GFBITS; j++) { + tmp[j] = PQCLEAN_MCELIECE6688128F_AVX_vec128_setbits((beta[i] >> j) & 1); + } + + PQCLEAN_MCELIECE6688128F_AVX_vec128_mul(tmp, pre.v[i], tmp); + + for (b = 0; b < GFBITS; b++) { + out128[b][1] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(out128[b][1], tmp[b]); + } + } + + for (b = 0; b < GFBITS; b++) { + v[0] = PQCLEAN_MCELIECE6688128F_AVX_vec128_extract(out128[b][0], 0); + v[1] = PQCLEAN_MCELIECE6688128F_AVX_vec128_extract(out128[b][0], 1); + v[2] = PQCLEAN_MCELIECE6688128F_AVX_vec128_extract(out128[b][1], 0); + v[3] = PQCLEAN_MCELIECE6688128F_AVX_vec128_extract(out128[b][1], 1); + + out[b] = PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(v[0], v[1], v[2], v[3]); + } +} + +void PQCLEAN_MCELIECE6688128F_AVX_fft_tr(vec256 *out, vec256 in[][ GFBITS ]) { + butterflies_tr(out, in); + radix_conversions_tr(out); +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/fft_tr.h b/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/fft_tr.h new file mode 100644 index 0000000000..210cc82b66 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/fft_tr.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE6688128F_AVX_FFT_TR_H +#define PQCLEAN_MCELIECE6688128F_AVX_FFT_TR_H +/* + This file is for transpose of the Gao-Mateer FFT +*/ + + +#include "params.h" +#include "vec256.h" + +void PQCLEAN_MCELIECE6688128F_AVX_fft_tr(vec256 * /*out*/, vec256 /*in*/[][ GFBITS ]); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/gf.c b/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/gf.c new file mode 100644 index 0000000000..bba70c12e4 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/gf.c @@ -0,0 +1,236 @@ +/* + this file is for functions for field arithmetic +*/ + +#include "gf.h" + +#include "params.h" + +#include + +/* field multiplication */ +gf PQCLEAN_MCELIECE6688128F_AVX_gf_mul(gf in0, gf in1) { + int i; + + uint64_t tmp; + uint64_t t0; + uint64_t t1; + uint64_t t; + + t0 = in0; + t1 = in1; + + tmp = t0 * (t1 & 1); + + for (i = 1; i < GFBITS; i++) { + tmp ^= (t0 * (t1 & ((uint64_t)1 << i))); + } + + // + + t = tmp & 0x1FF0000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + t = tmp & 0x000E000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + return tmp & GFMASK; +} + +/* 2 field multiplications */ +uint64_t PQCLEAN_MCELIECE6688128F_AVX_gf_mul2(gf a, gf b0, gf b1) { + int i; + + uint64_t tmp = 0; + uint64_t t0; + uint64_t t1; + uint64_t t; + uint64_t mask = 0x0000000100000001; + + t0 = a; + t1 = b1; + t1 = (t1 << 32) | b0; + + for (i = 0; i < GFBITS; i++) { + tmp ^= t0 * (t1 & mask); + mask += mask; + } + + // + + t = tmp & 0x01FF000001FF0000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + t = tmp & 
0x0000E0000000E000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + return tmp & 0x00001FFF00001FFF; +} + +/* 2 field squarings */ +static inline gf gf_sq2(gf in) { + int i; + + const uint64_t B[] = {0x1111111111111111, + 0x0303030303030303, + 0x000F000F000F000F, + 0x000000FF000000FF + }; + + const uint64_t M[] = {0x0001FF0000000000, + 0x000000FF80000000, + 0x000000007FC00000, + 0x00000000003FE000 + }; + + uint64_t x = in; + uint64_t t; + + x = (x | (x << 24)) & B[3]; + x = (x | (x << 12)) & B[2]; + x = (x | (x << 6)) & B[1]; + x = (x | (x << 3)) & B[0]; + + for (i = 0; i < 4; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* square and multiply */ +static inline gf gf_sqmul(gf in, gf m) { + int i; + + uint64_t x; + uint64_t t0; + uint64_t t1; + uint64_t t; + + const uint64_t M[] = {0x0000001FF0000000, + 0x000000000FF80000, + 0x000000000007E000 + }; + + t0 = in; + t1 = m; + + x = (t1 << 6) * (t0 & (1 << 6)); + + t0 ^= (t0 << 7); + + x ^= (t1 * (t0 & (0x04001))); + x ^= (t1 * (t0 & (0x08002))) << 1; + x ^= (t1 * (t0 & (0x10004))) << 2; + x ^= (t1 * (t0 & (0x20008))) << 3; + x ^= (t1 * (t0 & (0x40010))) << 4; + x ^= (t1 * (t0 & (0x80020))) << 5; + + for (i = 0; i < 3; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* square twice and multiply */ +static inline gf gf_sq2mul(gf in, gf m) { + int i; + + uint64_t x; + uint64_t t0; + uint64_t t1; + uint64_t t; + + const uint64_t M[] = {0x1FF0000000000000, + 0x000FF80000000000, + 0x000007FC00000000, + 0x00000003FE000000, + 0x0000000001FE0000, + 0x000000000001E000 + }; + + t0 = in; + t1 = m; + + x = (t1 << 18) * (t0 & (1 << 6)); + + t0 ^= (t0 << 21); + + x ^= (t1 * (t0 & (0x010000001))); + x ^= (t1 * (t0 & (0x020000002))) << 3; + x ^= (t1 * (t0 & (0x040000004))) << 6; + x ^= (t1 * (t0 & (0x080000008))) << 9; + x ^= (t1 * (t0 & (0x100000010))) << 12; + x ^= (t1 * (t0 & (0x200000020))) << 15; + + for (i = 0; i < 6; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* return num/den */ +gf PQCLEAN_MCELIECE6688128F_AVX_gf_frac(gf den, gf num) { + gf tmp_11; + gf tmp_1111; + gf out; + + tmp_11 = gf_sqmul(den, den); // 11 + tmp_1111 = gf_sq2mul(tmp_11, tmp_11); // 1111 + out = gf_sq2(tmp_1111); + out = gf_sq2mul(out, tmp_1111); // 11111111 + out = gf_sq2(out); + out = gf_sq2mul(out, tmp_1111); // 111111111111 + + return gf_sqmul(out, num); // 1111111111110 +} + +/* return 1/den */ +gf PQCLEAN_MCELIECE6688128F_AVX_gf_inv(gf in) { + return PQCLEAN_MCELIECE6688128F_AVX_gf_frac(in, ((gf) 1)); +} + +/* check if a == 0 */ +gf PQCLEAN_MCELIECE6688128F_AVX_gf_iszero(gf a) { + uint32_t t = a; + + t -= 1; + t >>= 19; + + return (gf) t; +} + +/* multiplication in GF((2^m)^t) */ +void PQCLEAN_MCELIECE6688128F_AVX_GF_mul(gf *out, const gf *in0, const gf *in1) { + int i, j; + + gf prod[255]; + + for (i = 0; i < 255; i++) { + prod[i] = 0; + } + + for (i = 0; i < 128; i++) { + for (j = 0; j < 128; j++) { + prod[i + j] ^= PQCLEAN_MCELIECE6688128F_AVX_gf_mul(in0[i], in1[j]); + } + } + + // + + for (i = 254; i >= 128; i--) { + prod[i - 123] ^= PQCLEAN_MCELIECE6688128F_AVX_gf_mul(prod[i], (gf) 7682); + prod[i - 125] ^= PQCLEAN_MCELIECE6688128F_AVX_gf_mul(prod[i], (gf) 2159); + prod[i - 128] ^= PQCLEAN_MCELIECE6688128F_AVX_gf_mul(prod[i], (gf) 3597); + } + + for (i = 0; i < 128; i++) { + out[i] = prod[i]; + } +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/gf.h 
b/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/gf.h new file mode 100644 index 0000000000..e92a9c54a1 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/gf.h @@ -0,0 +1,21 @@ +#ifndef PQCLEAN_MCELIECE6688128F_AVX_GF_H +#define PQCLEAN_MCELIECE6688128F_AVX_GF_H +/* + This file is for functions for field arithmetic +*/ + + +#include + +typedef uint16_t gf; + +gf PQCLEAN_MCELIECE6688128F_AVX_gf_iszero(gf /*a*/); +gf PQCLEAN_MCELIECE6688128F_AVX_gf_mul(gf /*in0*/, gf /*in1*/); +uint64_t PQCLEAN_MCELIECE6688128F_AVX_gf_mul2(gf /*a*/, gf /*b0*/, gf /*b1*/); +gf PQCLEAN_MCELIECE6688128F_AVX_gf_frac(gf /*den*/, gf /*num*/); +gf PQCLEAN_MCELIECE6688128F_AVX_gf_inv(gf /*in*/); + +void PQCLEAN_MCELIECE6688128F_AVX_GF_mul(gf * /*out*/, const gf * /*in0*/, const gf * /*in1*/); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/int32_sort.c b/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/int32_sort.c new file mode 100644 index 0000000000..f70f51ddab --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/int32_sort.c @@ -0,0 +1,1211 @@ +#include "int32_sort.h" +#define int32 int32_t + +#include + +typedef __m256i int32x8; +#define int32x8_load(z) _mm256_loadu_si256((__m256i_u *) (z)) +#define int32x8_store(z,i) _mm256_storeu_si256((__m256i_u *) (z),(i)) +#define int32x8_min _mm256_min_epi32 +#define int32x8_max _mm256_max_epi32 + +#define int32x8_MINMAX(a,b) \ + do { \ + int32x8 c = int32x8_min(a, b); \ + (b) = int32x8_max(a,b); \ + (a) = c; \ + } while(0) + +static inline void int32_MINMAX(int32 *a, int32 *b) { + int32 ab = *b ^ *a; + int32 c = *b - *a; + c ^= ab & (c ^ *b); + c >>= 31; + c &= ab; + *a ^= c; + *b ^= c; +} + +static void minmax_vector(int32 *x, int32 *y, size_t n) { + if (n < 8) { + while (n > 0) { + int32_MINMAX(x, y); + ++x; + ++y; + --n; + } + return; + } + if (n & 7) { + int32x8 x0 = int32x8_load(x + n - 8); + int32x8 y0 = int32x8_load(y + n - 8); + int32x8_MINMAX(x0, y0); + int32x8_store(x + n - 8, x0); + int32x8_store(y + n - 8, y0); + n &= ~7; + } + do { + int32x8 x0 = int32x8_load(x); + int32x8 y0 = int32x8_load(y); + int32x8_MINMAX(x0, y0); + int32x8_store(x, x0); + int32x8_store(y, y0); + x += 8; + y += 8; + n -= 8; + } while (n); +} + +/* stages 8,4,2,1 of size-16 bitonic merging */ +static void merge16_finish(int32 *x, int32x8 x0, int32x8 x1, int flagdown) { + int32x8 b0, b1, c0, c1, mask; + + int32x8_MINMAX(x0, x1); + + b0 = _mm256_permute2x128_si256(x0, x1, 0x20); /* A0123B0123 */ + b1 = _mm256_permute2x128_si256(x0, x1, 0x31); /* A4567B4567 */ + + int32x8_MINMAX(b0, b1); + + c0 = _mm256_unpacklo_epi64(b0, b1); /* A0145B0145 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* A2367B2367 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A0213B0213 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A4657B4657 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* A0246B0246 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* A1357B1357 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A0123B0123 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A4567B4567 */ + + x0 = _mm256_permute2x128_si256(b0, b1, 0x20); /* A01234567 */ + x1 = _mm256_permute2x128_si256(b0, b1, 0x31); /* A01234567 */ + + if (flagdown) { + mask = _mm256_set1_epi32(-1); + x0 ^= mask; + x1 ^= mask; + } + + int32x8_store(&x[0], x0); + int32x8_store(&x[8], x1); +} + +/* stages 64,32 of bitonic merging; n is multiple of 128 */ +static void int32_twostages_32(int32 *x, size_t n) { + size_t i; + + while (n > 0) { + for 
(i = 0; i < 32; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + 32]); + int32x8 x2 = int32x8_load(&x[i + 64]); + int32x8 x3 = int32x8_load(&x[i + 96]); + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + 32], x1); + int32x8_store(&x[i + 64], x2); + int32x8_store(&x[i + 96], x3); + } + x += 128; + n -= 128; + } +} + +/* stages 4q,2q,q of bitonic merging */ +static size_t int32_threestages(int32 *x, size_t n, size_t q) { + size_t k, i; + + for (k = 0; k + 8 * q <= n; k += 8 * q) { + for (i = k; i < k + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + } + + return k; +} + +/* n is a power of 2; n >= 8; if n == 8 then flagdown */ +// NOLINTNEXTLINE(google-readability-function-size) +static void int32_sort_2power(int32 *x, size_t n, int flagdown) { + size_t p, q, i, j, k; + int32x8 mask; + + if (n == 8) { + int32 x0 = x[0]; + int32 x1 = x[1]; + int32 x2 = x[2]; + int32 x3 = x[3]; + int32 x4 = x[4]; + int32 x5 = x[5]; + int32 x6 = x[6]; + int32 x7 = x[7]; + + /* odd-even sort instead of bitonic sort */ + + int32_MINMAX(&x1, &x0); + int32_MINMAX(&x3, &x2); + int32_MINMAX(&x2, &x0); + int32_MINMAX(&x3, &x1); + int32_MINMAX(&x2, &x1); + + int32_MINMAX(&x5, &x4); + int32_MINMAX(&x7, &x6); + int32_MINMAX(&x6, &x4); + int32_MINMAX(&x7, &x5); + int32_MINMAX(&x6, &x5); + + int32_MINMAX(&x4, &x0); + int32_MINMAX(&x6, &x2); + int32_MINMAX(&x4, &x2); + + int32_MINMAX(&x5, &x1); + int32_MINMAX(&x7, &x3); + int32_MINMAX(&x5, &x3); + + int32_MINMAX(&x2, &x1); + int32_MINMAX(&x4, &x3); + int32_MINMAX(&x6, &x5); + + x[0] = x0; + x[1] = x1; + x[2] = x2; + x[3] = x3; + x[4] = x4; + x[5] = x5; + x[6] = x6; + x[7] = x7; + return; + } + + if (n == 16) { + int32x8 x0, x1, b0, b1, c0, c1; + + x0 = int32x8_load(&x[0]); + x1 = int32x8_load(&x[8]); + + mask = _mm256_set_epi32(0, 0, -1, -1, 0, 0, -1, -1); + + x0 ^= mask; /* A01234567 */ + x1 ^= mask; /* B01234567 */ + + b0 = _mm256_unpacklo_epi32(x0, x1); /* AB0AB1AB4AB5 */ + b1 = _mm256_unpackhi_epi32(x0, x1); /* AB2AB3AB6AB7 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* AB0AB2AB4AB6 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* AB1AB3AB5AB7 */ + + int32x8_MINMAX(c0, c1); + + mask = _mm256_set_epi32(0, 0, -1, -1, -1, -1, 0, 0); + c0 ^= mask; + c1 ^= mask; + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A01B01A45B45 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A23B23A67B67 */ + + int32x8_MINMAX(b0, b1); + + x0 = _mm256_unpacklo_epi64(b0, b1); /* A01234567 */ + x1 = _mm256_unpackhi_epi64(b0, b1); /* B01234567 */ + + b0 = _mm256_unpacklo_epi32(x0, x1); /* 
AB0AB1AB4AB5 */ + b1 = _mm256_unpackhi_epi32(x0, x1); /* AB2AB3AB6AB7 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* AB0AB2AB4AB6 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* AB1AB3AB5AB7 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A01B01A45B45 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A23B23A67B67 */ + + b0 ^= mask; + b1 ^= mask; + + c0 = _mm256_permute2x128_si256(b0, b1, 0x20); /* A01B01A23B23 */ + c1 = _mm256_permute2x128_si256(b0, b1, 0x31); /* A45B45A67B67 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_permute2x128_si256(c0, c1, 0x20); /* A01B01A45B45 */ + b1 = _mm256_permute2x128_si256(c0, c1, 0x31); /* A23B23A67B67 */ + + int32x8_MINMAX(b0, b1); + + x0 = _mm256_unpacklo_epi64(b0, b1); /* A01234567 */ + x1 = _mm256_unpackhi_epi64(b0, b1); /* B01234567 */ + + b0 = _mm256_unpacklo_epi32(x0, x1); /* AB0AB1AB4AB5 */ + b1 = _mm256_unpackhi_epi32(x0, x1); /* AB2AB3AB6AB7 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* AB0AB2AB4AB6 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* AB1AB3AB5AB7 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A01B01A45B45 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A23B23A67B67 */ + + x0 = _mm256_unpacklo_epi64(b0, b1); /* A01234567 */ + x1 = _mm256_unpackhi_epi64(b0, b1); /* B01234567 */ + + mask = _mm256_set1_epi32(-1); + if (flagdown) { + x1 ^= mask; + } else { + x0 ^= mask; + } + + merge16_finish(x, x0, x1, flagdown); + return; + } + + if (n == 32) { + int32x8 x0, x1, x2, x3; + + int32_sort_2power(x, 16, 1); + int32_sort_2power(x + 16, 16, 0); + + x0 = int32x8_load(&x[0]); + x1 = int32x8_load(&x[8]); + x2 = int32x8_load(&x[16]); + x3 = int32x8_load(&x[24]); + + if (flagdown) { + mask = _mm256_set1_epi32(-1); + x0 ^= mask; + x1 ^= mask; + x2 ^= mask; + x3 ^= mask; + } + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + + merge16_finish(x, x0, x1, flagdown); + merge16_finish(x + 16, x2, x3, flagdown); + return; + } + + p = n >> 3; + for (i = 0; i < p; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x2 = int32x8_load(&x[i + 2 * p]); + int32x8 x4 = int32x8_load(&x[i + 4 * p]); + int32x8 x6 = int32x8_load(&x[i + 6 * p]); + + /* odd-even stage instead of bitonic stage */ + + int32x8_MINMAX(x4, x0); + int32x8_MINMAX(x6, x2); + int32x8_MINMAX(x2, x0); + int32x8_MINMAX(x6, x4); + int32x8_MINMAX(x2, x4); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + 2 * p], x2); + int32x8_store(&x[i + 4 * p], x4); + int32x8_store(&x[i + 6 * p], x6); + + int32x8 x1 = int32x8_load(&x[i + p]); + int32x8 x3 = int32x8_load(&x[i + 3 * p]); + int32x8 x5 = int32x8_load(&x[i + 5 * p]); + int32x8 x7 = int32x8_load(&x[i + 7 * p]); + + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x5, x3); + + int32x8_store(&x[i + p], x1); + int32x8_store(&x[i + 3 * p], x3); + int32x8_store(&x[i + 5 * p], x5); + int32x8_store(&x[i + 7 * p], x7); + } + + if (n >= 128) { + int flip, flipflip; + + mask = _mm256_set1_epi32(-1); + + for (j = 0; j < n; j += 32) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 16]); + x0 ^= mask; + x1 ^= mask; + int32x8_store(&x[j], x0); + int32x8_store(&x[j + 16], x1); + } + + p = 8; + for (;;) { /* for p in [8, 16, ..., n/16] */ + q = p >> 1; + while (q >= 128) { + int32_threestages(x, n, q >> 2); + q >>= 3; + } + if (q == 64) { + int32_twostages_32(x, n); + q = 16; + } + if (q == 32) { + q = 8; + for (k = 0; k < n; k += 8 * q) { + for (i = k; i < k + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = 
int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + } + q = 4; + } + if (q == 16) { + q = 8; + for (k = 0; k < n; k += 4 * q) { + for (i = k; i < k + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + } + } + q = 4; + } + if (q == 8) { + for (k = 0; k < n; k += q + q) { + int32x8 x0 = int32x8_load(&x[k]); + int32x8 x1 = int32x8_load(&x[k + q]); + + int32x8_MINMAX(x0, x1); + + int32x8_store(&x[k], x0); + int32x8_store(&x[k + q], x1); + } + } + + q = n >> 3; + flip = 0; + if (p << 1 == q) { + flip = 1; + } + flipflip = 1 - flip; + for (j = 0; j < q; j += p + p) { + for (k = j; k < j + p + p; k += p) { + for (i = k; i < k + p; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + + if (flip) { + x0 ^= mask; + x1 ^= mask; + x2 ^= mask; + x3 ^= mask; + x4 ^= mask; + x5 ^= mask; + x6 ^= mask; + x7 ^= mask; + } + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + flip ^= 1; + } + flip ^= flipflip; + } + + if (p << 4 == n) { + break; + } + p <<= 1; + } + } + + for (p = 4; p >= 1; p >>= 1) { + int32 *z = x; + int32 *target = x + n; + if (p == 4) { + mask = _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1); + while (z != target) { + int32x8 x0 = int32x8_load(&z[0]); + int32x8 x1 = int32x8_load(&z[8]); + x0 ^= mask; + x1 ^= mask; + int32x8_store(&z[0], x0); + int32x8_store(&z[8], x1); + z += 16; + } + } else if (p == 2) { + mask = _mm256_set_epi32(0, 0, -1, -1, -1, -1, 0, 0); + while (z != target) { + int32x8 x0 = int32x8_load(&z[0]); + int32x8 x1 = int32x8_load(&z[8]); + x0 ^= mask; + x1 ^= mask; + 
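+ /* Xoring with the 0/-1 mask complements the middle four lanes of each
+    8-element block so the following compare-exchange runs in the direction
+    this merge step needs; the permutes pair the two 128-bit halves of each
+    block so a single min/max performs the distance-4 exchange before the
+    results are regrouped and stored. */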
int32x8 b0 = _mm256_permute2x128_si256(x0, x1, 0x20); + int32x8 b1 = _mm256_permute2x128_si256(x0, x1, 0x31); + int32x8_MINMAX(b0, b1); + int32x8 c0 = _mm256_permute2x128_si256(b0, b1, 0x20); + int32x8 c1 = _mm256_permute2x128_si256(b0, b1, 0x31); + int32x8_store(&z[0], c0); + int32x8_store(&z[8], c1); + z += 16; + } + } else { /* p == 1 */ + mask = _mm256_set_epi32(0, -1, -1, 0, 0, -1, -1, 0); + while (z != target) { + int32x8 x0 = int32x8_load(&z[0]); + int32x8 x1 = int32x8_load(&z[8]); + x0 ^= mask; + x1 ^= mask; + int32x8 b0 = _mm256_permute2x128_si256(x0, x1, 0x20); /* A0123B0123 */ + int32x8 b1 = _mm256_permute2x128_si256(x0, x1, 0x31); /* A4567B4567 */ + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); /* A0145B0145 */ + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); /* A2367B2367 */ + int32x8_MINMAX(c0, c1); + int32x8 d0 = _mm256_unpacklo_epi64(c0, c1); /* A0123B0123 */ + int32x8 d1 = _mm256_unpackhi_epi64(c0, c1); /* A4567B4567 */ + int32x8_MINMAX(d0, d1); + int32x8 e0 = _mm256_permute2x128_si256(d0, d1, 0x20); + int32x8 e1 = _mm256_permute2x128_si256(d0, d1, 0x31); + int32x8_store(&z[0], e0); + int32x8_store(&z[8], e1); + z += 16; + } + } + + q = n >> 4; + while (q >= 128 || q == 32) { + int32_threestages(x, n, q >> 2); + q >>= 3; + } + while (q >= 16) { + q >>= 1; + for (j = 0; j < n; j += 4 * q) { + for (k = j; k < j + q; k += 8) { + int32x8 x0 = int32x8_load(&x[k]); + int32x8 x1 = int32x8_load(&x[k + q]); + int32x8 x2 = int32x8_load(&x[k + 2 * q]); + int32x8 x3 = int32x8_load(&x[k + 3 * q]); + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + + int32x8_store(&x[k], x0); + int32x8_store(&x[k + q], x1); + int32x8_store(&x[k + 2 * q], x2); + int32x8_store(&x[k + 3 * q], x3); + } + } + q >>= 1; + } + if (q == 8) { + for (j = 0; j < n; j += 2 * q) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + q]); + + int32x8_MINMAX(x0, x1); + + int32x8_store(&x[j], x0); + int32x8_store(&x[j + q], x1); + } + } + + q = n >> 3; + for (k = 0; k < q; k += 8) { + int32x8 x0 = int32x8_load(&x[k]); + int32x8 x1 = int32x8_load(&x[k + q]); + int32x8 x2 = int32x8_load(&x[k + 2 * q]); + int32x8 x3 = int32x8_load(&x[k + 3 * q]); + int32x8 x4 = int32x8_load(&x[k + 4 * q]); + int32x8 x5 = int32x8_load(&x[k + 5 * q]); + int32x8 x6 = int32x8_load(&x[k + 6 * q]); + int32x8 x7 = int32x8_load(&x[k + 7 * q]); + + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + + int32x8_store(&x[k], x0); + int32x8_store(&x[k + q], x1); + int32x8_store(&x[k + 2 * q], x2); + int32x8_store(&x[k + 3 * q], x3); + int32x8_store(&x[k + 4 * q], x4); + int32x8_store(&x[k + 5 * q], x5); + int32x8_store(&x[k + 6 * q], x6); + int32x8_store(&x[k + 7 * q], x7); + } + } + + /* everything is still masked with _mm256_set_epi32(0,-1,0,-1,0,-1,0,-1); */ + mask = _mm256_set1_epi32(-1); + + for (i = 0; i < n; i += 64) { + int32x8 a0 = int32x8_load(&x[i]); + int32x8 a1 = int32x8_load(&x[i + 8]); + int32x8 a2 = int32x8_load(&x[i + 16]); + int32x8 a3 = int32x8_load(&x[i + 24]); + int32x8 a4 = int32x8_load(&x[i + 32]); + int32x8 a5 = int32x8_load(&x[i + 40]); + int32x8 a6 = int32x8_load(&x[i + 48]); + int32x8 a7 = int32x8_load(&x[i + 56]); + + int32x8 b0 = _mm256_unpacklo_epi32(a0, a1); /* AB0AB1AB4AB5 */ + int32x8 b1 = 
_mm256_unpackhi_epi32(a0, a1); /* AB2AB3AB6AB7 */ + int32x8 b2 = _mm256_unpacklo_epi32(a2, a3); /* CD0CD1CD4CD5 */ + int32x8 b3 = _mm256_unpackhi_epi32(a2, a3); /* CD2CD3CD6CD7 */ + int32x8 b4 = _mm256_unpacklo_epi32(a4, a5); /* EF0EF1EF4EF5 */ + int32x8 b5 = _mm256_unpackhi_epi32(a4, a5); /* EF2EF3EF6EF7 */ + int32x8 b6 = _mm256_unpacklo_epi32(a6, a7); /* GH0GH1GH4GH5 */ + int32x8 b7 = _mm256_unpackhi_epi32(a6, a7); /* GH2GH3GH6GH7 */ + + int32x8 c0 = _mm256_unpacklo_epi64(b0, b2); /* ABCD0ABCD4 */ + int32x8 c1 = _mm256_unpacklo_epi64(b1, b3); /* ABCD2ABCD6 */ + int32x8 c2 = _mm256_unpackhi_epi64(b0, b2); /* ABCD1ABCD5 */ + int32x8 c3 = _mm256_unpackhi_epi64(b1, b3); /* ABCD3ABCD7 */ + int32x8 c4 = _mm256_unpacklo_epi64(b4, b6); /* EFGH0EFGH4 */ + int32x8 c5 = _mm256_unpacklo_epi64(b5, b7); /* EFGH2EFGH6 */ + int32x8 c6 = _mm256_unpackhi_epi64(b4, b6); /* EFGH1EFGH5 */ + int32x8 c7 = _mm256_unpackhi_epi64(b5, b7); /* EFGH3EFGH7 */ + + if (flagdown) { + c2 ^= mask; + c3 ^= mask; + c6 ^= mask; + c7 ^= mask; + } else { + c0 ^= mask; + c1 ^= mask; + c4 ^= mask; + c5 ^= mask; + } + + int32x8 d0 = _mm256_permute2x128_si256(c0, c4, 0x20); /* ABCDEFGH0 */ + int32x8 d1 = _mm256_permute2x128_si256(c2, c6, 0x20); /* ABCDEFGH1 */ + int32x8 d2 = _mm256_permute2x128_si256(c1, c5, 0x20); /* ABCDEFGH2 */ + int32x8 d3 = _mm256_permute2x128_si256(c3, c7, 0x20); /* ABCDEFGH5 */ + int32x8 d4 = _mm256_permute2x128_si256(c0, c4, 0x31); /* ABCDEFGH4 */ + int32x8 d5 = _mm256_permute2x128_si256(c2, c6, 0x31); /* ABCDEFGH3 */ + int32x8 d6 = _mm256_permute2x128_si256(c1, c5, 0x31); /* ABCDEFGH6 */ + int32x8 d7 = _mm256_permute2x128_si256(c3, c7, 0x31); /* ABCDEFGH7 */ + + int32x8_MINMAX(d0, d1); + int32x8_MINMAX(d2, d3); + int32x8_MINMAX(d4, d5); + int32x8_MINMAX(d6, d7); + int32x8_MINMAX(d0, d2); + int32x8_MINMAX(d1, d3); + int32x8_MINMAX(d4, d6); + int32x8_MINMAX(d5, d7); + int32x8_MINMAX(d0, d4); + int32x8_MINMAX(d1, d5); + int32x8_MINMAX(d2, d6); + int32x8_MINMAX(d3, d7); + + int32x8 e0 = _mm256_unpacklo_epi32(d0, d1); + int32x8 e1 = _mm256_unpackhi_epi32(d0, d1); + int32x8 e2 = _mm256_unpacklo_epi32(d2, d3); + int32x8 e3 = _mm256_unpackhi_epi32(d2, d3); + int32x8 e4 = _mm256_unpacklo_epi32(d4, d5); + int32x8 e5 = _mm256_unpackhi_epi32(d4, d5); + int32x8 e6 = _mm256_unpacklo_epi32(d6, d7); + int32x8 e7 = _mm256_unpackhi_epi32(d6, d7); + + int32x8 f0 = _mm256_unpacklo_epi64(e0, e2); + int32x8 f1 = _mm256_unpacklo_epi64(e1, e3); + int32x8 f2 = _mm256_unpackhi_epi64(e0, e2); + int32x8 f3 = _mm256_unpackhi_epi64(e1, e3); + int32x8 f4 = _mm256_unpacklo_epi64(e4, e6); + int32x8 f5 = _mm256_unpacklo_epi64(e5, e7); + int32x8 f6 = _mm256_unpackhi_epi64(e4, e6); + int32x8 f7 = _mm256_unpackhi_epi64(e5, e7); + + int32x8 g0 = _mm256_permute2x128_si256(f0, f4, 0x20); + int32x8 g1 = _mm256_permute2x128_si256(f2, f6, 0x20); + int32x8 g2 = _mm256_permute2x128_si256(f1, f5, 0x20); + int32x8 g3 = _mm256_permute2x128_si256(f3, f7, 0x20); + int32x8 g4 = _mm256_permute2x128_si256(f0, f4, 0x31); + int32x8 g5 = _mm256_permute2x128_si256(f2, f6, 0x31); + int32x8 g6 = _mm256_permute2x128_si256(f1, f5, 0x31); + int32x8 g7 = _mm256_permute2x128_si256(f3, f7, 0x31); + + int32x8_store(&x[i], g0); + int32x8_store(&x[i + 8], g1); + int32x8_store(&x[i + 16], g2); + int32x8_store(&x[i + 24], g3); + int32x8_store(&x[i + 32], g4); + int32x8_store(&x[i + 40], g5); + int32x8_store(&x[i + 48], g6); + int32x8_store(&x[i + 56], g7); + } + + q = n >> 4; + while (q >= 128 || q == 32) { + q >>= 2; + for (j = 0; j < n; j += 8 * q) { + for (i = j; i < j + 
q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + } + q >>= 1; + } + while (q >= 16) { + q >>= 1; + for (j = 0; j < n; j += 4 * q) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + } + } + q >>= 1; + } + if (q == 8) { + for (j = 0; j < n; j += q + q) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + q]); + int32x8_MINMAX(x0, x1); + int32x8_store(&x[j], x0); + int32x8_store(&x[j + q], x1); + } + } + + q = n >> 3; + for (i = 0; i < q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + + int32x8 b0 = _mm256_unpacklo_epi32(x0, x4); /* AE0AE1AE4AE5 */ + int32x8 b1 = _mm256_unpackhi_epi32(x0, x4); /* AE2AE3AE6AE7 */ + int32x8 b2 = _mm256_unpacklo_epi32(x1, x5); /* BF0BF1BF4BF5 */ + int32x8 b3 = _mm256_unpackhi_epi32(x1, x5); /* BF2BF3BF6BF7 */ + int32x8 b4 = _mm256_unpacklo_epi32(x2, x6); /* CG0CG1CG4CG5 */ + int32x8 b5 = _mm256_unpackhi_epi32(x2, x6); /* CG2CG3CG6CG7 */ + int32x8 b6 = _mm256_unpacklo_epi32(x3, x7); /* DH0DH1DH4DH5 */ + int32x8 b7 = _mm256_unpackhi_epi32(x3, x7); /* DH2DH3DH6DH7 */ + + int32x8 c0 = _mm256_unpacklo_epi64(b0, b4); /* AECG0AECG4 */ + int32x8 c1 = _mm256_unpacklo_epi64(b1, b5); /* AECG2AECG6 */ + int32x8 c2 = _mm256_unpackhi_epi64(b0, b4); /* AECG1AECG5 */ + int32x8 c3 = _mm256_unpackhi_epi64(b1, b5); /* AECG3AECG7 */ + int32x8 c4 = _mm256_unpacklo_epi64(b2, b6); /* BFDH0BFDH4 */ + int32x8 c5 = _mm256_unpacklo_epi64(b3, b7); /* BFDH2BFDH6 */ + int32x8 c6 = _mm256_unpackhi_epi64(b2, b6); /* BFDH1BFDH5 */ + int32x8 c7 = _mm256_unpackhi_epi64(b3, b7); /* BFDH3BFDH7 */ + + int32x8 d0 = _mm256_permute2x128_si256(c0, c4, 0x20); /* AECGBFDH0 */ + int32x8 d1 = 
_mm256_permute2x128_si256(c1, c5, 0x20); /* AECGBFDH2 */ + int32x8 d2 = _mm256_permute2x128_si256(c2, c6, 0x20); /* AECGBFDH1 */ + int32x8 d3 = _mm256_permute2x128_si256(c3, c7, 0x20); /* AECGBFDH3 */ + int32x8 d4 = _mm256_permute2x128_si256(c0, c4, 0x31); /* AECGBFDH4 */ + int32x8 d5 = _mm256_permute2x128_si256(c1, c5, 0x31); /* AECGBFDH6 */ + int32x8 d6 = _mm256_permute2x128_si256(c2, c6, 0x31); /* AECGBFDH5 */ + int32x8 d7 = _mm256_permute2x128_si256(c3, c7, 0x31); /* AECGBFDH7 */ + + if (flagdown) { + d0 ^= mask; + d1 ^= mask; + d2 ^= mask; + d3 ^= mask; + d4 ^= mask; + d5 ^= mask; + d6 ^= mask; + d7 ^= mask; + } + + int32x8_store(&x[i], d0); + int32x8_store(&x[i + q], d4); + int32x8_store(&x[i + 2 * q], d1); + int32x8_store(&x[i + 3 * q], d5); + int32x8_store(&x[i + 4 * q], d2); + int32x8_store(&x[i + 5 * q], d6); + int32x8_store(&x[i + 6 * q], d3); + int32x8_store(&x[i + 7 * q], d7); + } +} + +void PQCLEAN_MCELIECE6688128F_AVX_int32_sort(int32 *x, size_t n) { + size_t q, i, j; + + if (n <= 8) { + if (n == 8) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + int32_MINMAX(&x[4], &x[5]); + int32_MINMAX(&x[5], &x[6]); + int32_MINMAX(&x[6], &x[7]); + } + if (n >= 7) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + int32_MINMAX(&x[4], &x[5]); + int32_MINMAX(&x[5], &x[6]); + } + if (n >= 6) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + int32_MINMAX(&x[4], &x[5]); + } + if (n >= 5) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + } + if (n >= 4) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + } + if (n >= 3) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + } + if (n >= 2) { + int32_MINMAX(&x[0], &x[1]); + } + return; + } + + if (!(n & (n - 1))) { + int32_sort_2power(x, n, 0); + return; + } + + q = 8; + while (q < n - q) { + q += q; + } + /* n > q >= 8 */ + + if (q <= 128) { /* n <= 256 */ + int32x8 y[32]; + for (i = q >> 3; i < q >> 2; ++i) { + y[i] = _mm256_set1_epi32(0x7fffffff); + } + for (i = 0; i < n; ++i) { + ((int32 *)y)[i] = x[i]; + } + int32_sort_2power((int32 *) y, 2 * q, 0); + for (i = 0; i < n; ++i) { + x[i] = ((int32 *) y)[i]; + } + return; + } + + int32_sort_2power(x, q, 1); + PQCLEAN_MCELIECE6688128F_AVX_int32_sort(x + q, n - q); + + while (q >= 64) { + q >>= 2; + j = int32_threestages(x, n, q); + minmax_vector(x + j, x + j + 4 * q, n - 4 * q - j); + if (j + 4 * q <= n) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + } + j += 4 * q; + } + minmax_vector(x + j, x + j + 2 * q, n - 2 * q - j); + if (j + 2 * q <= n) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8_MINMAX(x0, x1); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + } + j += 2 * q; + } + minmax_vector(x + j, x + j + q, n - q - j); + q >>= 1; + } + if (q == 32) { + j = 0; + for (; j + 64 <= n; j 
+= 64) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 8]); + int32x8 x2 = int32x8_load(&x[j + 16]); + int32x8 x3 = int32x8_load(&x[j + 24]); + int32x8 x4 = int32x8_load(&x[j + 32]); + int32x8 x5 = int32x8_load(&x[j + 40]); + int32x8 x6 = int32x8_load(&x[j + 48]); + int32x8 x7 = int32x8_load(&x[j + 56]); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8 a0 = _mm256_permute2x128_si256(x0, x1, 0x20); + int32x8 a1 = _mm256_permute2x128_si256(x0, x1, 0x31); + int32x8 a2 = _mm256_permute2x128_si256(x2, x3, 0x20); + int32x8 a3 = _mm256_permute2x128_si256(x2, x3, 0x31); + int32x8 a4 = _mm256_permute2x128_si256(x4, x5, 0x20); + int32x8 a5 = _mm256_permute2x128_si256(x4, x5, 0x31); + int32x8 a6 = _mm256_permute2x128_si256(x6, x7, 0x20); + int32x8 a7 = _mm256_permute2x128_si256(x6, x7, 0x31); + int32x8_MINMAX(a0, a1); + int32x8_MINMAX(a2, a3); + int32x8_MINMAX(a4, a5); + int32x8_MINMAX(a6, a7); + int32x8 b0 = _mm256_permute2x128_si256(a0, a1, 0x20); + int32x8 b1 = _mm256_permute2x128_si256(a0, a1, 0x31); + int32x8 b2 = _mm256_permute2x128_si256(a2, a3, 0x20); + int32x8 b3 = _mm256_permute2x128_si256(a2, a3, 0x31); + int32x8 b4 = _mm256_permute2x128_si256(a4, a5, 0x20); + int32x8 b5 = _mm256_permute2x128_si256(a4, a5, 0x31); + int32x8 b6 = _mm256_permute2x128_si256(a6, a7, 0x20); + int32x8 b7 = _mm256_permute2x128_si256(a6, a7, 0x31); + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); + int32x8 c2 = _mm256_unpacklo_epi64(b2, b3); + int32x8 c3 = _mm256_unpackhi_epi64(b2, b3); + int32x8 c4 = _mm256_unpacklo_epi64(b4, b5); + int32x8 c5 = _mm256_unpackhi_epi64(b4, b5); + int32x8 c6 = _mm256_unpacklo_epi64(b6, b7); + int32x8 c7 = _mm256_unpackhi_epi64(b6, b7); + int32x8_MINMAX(c0, c1); + int32x8_MINMAX(c2, c3); + int32x8_MINMAX(c4, c5); + int32x8_MINMAX(c6, c7); + int32x8 d0 = _mm256_unpacklo_epi32(c0, c1); + int32x8 d1 = _mm256_unpackhi_epi32(c0, c1); + int32x8 d2 = _mm256_unpacklo_epi32(c2, c3); + int32x8 d3 = _mm256_unpackhi_epi32(c2, c3); + int32x8 d4 = _mm256_unpacklo_epi32(c4, c5); + int32x8 d5 = _mm256_unpackhi_epi32(c4, c5); + int32x8 d6 = _mm256_unpacklo_epi32(c6, c7); + int32x8 d7 = _mm256_unpackhi_epi32(c6, c7); + int32x8 e0 = _mm256_unpacklo_epi64(d0, d1); + int32x8 e1 = _mm256_unpackhi_epi64(d0, d1); + int32x8 e2 = _mm256_unpacklo_epi64(d2, d3); + int32x8 e3 = _mm256_unpackhi_epi64(d2, d3); + int32x8 e4 = _mm256_unpacklo_epi64(d4, d5); + int32x8 e5 = _mm256_unpackhi_epi64(d4, d5); + int32x8 e6 = _mm256_unpacklo_epi64(d6, d7); + int32x8 e7 = _mm256_unpackhi_epi64(d6, d7); + int32x8_MINMAX(e0, e1); + int32x8_MINMAX(e2, e3); + int32x8_MINMAX(e4, e5); + int32x8_MINMAX(e6, e7); + int32x8 f0 = _mm256_unpacklo_epi32(e0, e1); + int32x8 f1 = _mm256_unpackhi_epi32(e0, e1); + int32x8 f2 = _mm256_unpacklo_epi32(e2, e3); + int32x8 f3 = _mm256_unpackhi_epi32(e2, e3); + int32x8 f4 = _mm256_unpacklo_epi32(e4, e5); + int32x8 f5 = _mm256_unpackhi_epi32(e4, e5); + int32x8 f6 = _mm256_unpacklo_epi32(e6, e7); + int32x8 f7 = _mm256_unpackhi_epi32(e6, e7); + int32x8_store(&x[j], f0); + int32x8_store(&x[j + 8], f1); + int32x8_store(&x[j + 16], f2); + int32x8_store(&x[j + 24], f3); + int32x8_store(&x[j + 32], f4); + int32x8_store(&x[j + 40], f5); + int32x8_store(&x[j + 48], f6); + 
int32x8_store(&x[j + 56], f7); + } + minmax_vector(x + j, x + j + 32, n - 32 - j); + goto continue16; + } + if (q == 16) { + j = 0; +continue16: + for (; j + 32 <= n; j += 32) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 8]); + int32x8 x2 = int32x8_load(&x[j + 16]); + int32x8 x3 = int32x8_load(&x[j + 24]); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8 a0 = _mm256_permute2x128_si256(x0, x1, 0x20); + int32x8 a1 = _mm256_permute2x128_si256(x0, x1, 0x31); + int32x8 a2 = _mm256_permute2x128_si256(x2, x3, 0x20); + int32x8 a3 = _mm256_permute2x128_si256(x2, x3, 0x31); + int32x8_MINMAX(a0, a1); + int32x8_MINMAX(a2, a3); + int32x8 b0 = _mm256_permute2x128_si256(a0, a1, 0x20); + int32x8 b1 = _mm256_permute2x128_si256(a0, a1, 0x31); + int32x8 b2 = _mm256_permute2x128_si256(a2, a3, 0x20); + int32x8 b3 = _mm256_permute2x128_si256(a2, a3, 0x31); + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); + int32x8 c2 = _mm256_unpacklo_epi64(b2, b3); + int32x8 c3 = _mm256_unpackhi_epi64(b2, b3); + int32x8_MINMAX(c0, c1); + int32x8_MINMAX(c2, c3); + int32x8 d0 = _mm256_unpacklo_epi32(c0, c1); + int32x8 d1 = _mm256_unpackhi_epi32(c0, c1); + int32x8 d2 = _mm256_unpacklo_epi32(c2, c3); + int32x8 d3 = _mm256_unpackhi_epi32(c2, c3); + int32x8 e0 = _mm256_unpacklo_epi64(d0, d1); + int32x8 e1 = _mm256_unpackhi_epi64(d0, d1); + int32x8 e2 = _mm256_unpacklo_epi64(d2, d3); + int32x8 e3 = _mm256_unpackhi_epi64(d2, d3); + int32x8_MINMAX(e0, e1); + int32x8_MINMAX(e2, e3); + int32x8 f0 = _mm256_unpacklo_epi32(e0, e1); + int32x8 f1 = _mm256_unpackhi_epi32(e0, e1); + int32x8 f2 = _mm256_unpacklo_epi32(e2, e3); + int32x8 f3 = _mm256_unpackhi_epi32(e2, e3); + int32x8_store(&x[j], f0); + int32x8_store(&x[j + 8], f1); + int32x8_store(&x[j + 16], f2); + int32x8_store(&x[j + 24], f3); + } + minmax_vector(x + j, x + j + 16, n - 16 - j); + goto continue8; + } + /* q == 8 */ + j = 0; +continue8: + for (; j + 16 <= n; j += 16) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 8]); + int32x8_MINMAX(x0, x1); + int32x8_store(&x[j], x0); + int32x8_store(&x[j + 8], x1); + int32x8 a0 = _mm256_permute2x128_si256(x0, x1, 0x20); /* x0123y0123 */ + int32x8 a1 = _mm256_permute2x128_si256(x0, x1, 0x31); /* x4567y4567 */ + int32x8_MINMAX(a0, a1); + int32x8 b0 = _mm256_permute2x128_si256(a0, a1, 0x20); /* x01234567 */ + int32x8 b1 = _mm256_permute2x128_si256(a0, a1, 0x31); /* y01234567 */ + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); /* x01y01x45y45 */ + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); /* x23y23x67y67 */ + int32x8_MINMAX(c0, c1); + int32x8 d0 = _mm256_unpacklo_epi32(c0, c1); /* x02x13x46x57 */ + int32x8 d1 = _mm256_unpackhi_epi32(c0, c1); /* y02y13y46y57 */ + int32x8 e0 = _mm256_unpacklo_epi64(d0, d1); /* x02y02x46y46 */ + int32x8 e1 = _mm256_unpackhi_epi64(d0, d1); /* x13y13x57y57 */ + int32x8_MINMAX(e0, e1); + int32x8 f0 = _mm256_unpacklo_epi32(e0, e1); /* x01234567 */ + int32x8 f1 = _mm256_unpackhi_epi32(e0, e1); /* y01234567 */ + int32x8_store(&x[j], f0); + int32x8_store(&x[j + 8], f1); + } + minmax_vector(x + j, x + j + 8, n - 8 - j); + if (j + 8 <= n) { + int32_MINMAX(&x[j], &x[j + 4]); + int32_MINMAX(&x[j + 1], &x[j + 5]); + int32_MINMAX(&x[j + 2], &x[j + 6]); + int32_MINMAX(&x[j + 3], &x[j + 7]); + int32_MINMAX(&x[j], &x[j + 2]); + int32_MINMAX(&x[j + 1], &x[j + 3]); + int32_MINMAX(&x[j], &x[j + 1]); + int32_MINMAX(&x[j + 2], &x[j + 3]); + int32_MINMAX(&x[j + 4], &x[j + 6]); + 
int32_MINMAX(&x[j + 5], &x[j + 7]); + int32_MINMAX(&x[j + 4], &x[j + 5]); + int32_MINMAX(&x[j + 6], &x[j + 7]); + j += 8; + } + minmax_vector(x + j, x + j + 4, n - 4 - j); + if (j + 4 <= n) { + int32_MINMAX(&x[j], &x[j + 2]); + int32_MINMAX(&x[j + 1], &x[j + 3]); + int32_MINMAX(&x[j], &x[j + 1]); + int32_MINMAX(&x[j + 2], &x[j + 3]); + j += 4; + } + if (j + 3 <= n) { + int32_MINMAX(&x[j], &x[j + 2]); + } + if (j + 2 <= n) { + int32_MINMAX(&x[j], &x[j + 1]); + } +} diff --git a/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/int32_sort.h b/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/int32_sort.h new file mode 100644 index 0000000000..fa408e9421 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/int32_sort.h @@ -0,0 +1,9 @@ +#ifndef PQCLEAN_MCELIECE6688128F_AVX_INT32_SORT_H +#define PQCLEAN_MCELIECE6688128F_AVX_INT32_SORT_H + +#include <stddef.h> +#include <stdint.h> + +void PQCLEAN_MCELIECE6688128F_AVX_int32_sort(int32_t *x, size_t n); + +#endif diff --git a/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/operations.c b/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/operations.c new file mode 100644 index 0000000000..be8c6bcff5 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/operations.c @@ -0,0 +1,136 @@ +#include "api.h" + +#include "aes256ctr.h" +#include "controlbits.h" +#include "crypto_hash.h" +#include "decrypt.h" +#include "encrypt.h" +#include "params.h" +#include "pk_gen.h" +#include "randombytes.h" +#include "sk_gen.h" +#include "util.h" + +#include <stdint.h> +#include <string.h> + +int PQCLEAN_MCELIECE6688128F_AVX_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const uint8_t *pk +) { + uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; + uint8_t *e = two_e + 1; + uint8_t one_ec[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ] = {1}; + + PQCLEAN_MCELIECE6688128F_AVX_encrypt(c, e, pk); + + crypto_hash_32b(c + SYND_BYTES, two_e, sizeof(two_e)); + + memcpy(one_ec + 1, e, SYS_N / 8); + memcpy(one_ec + 1 + SYS_N / 8, c, SYND_BYTES + 32); + + crypto_hash_32b(key, one_ec, sizeof(one_ec)); + + return 0; +} + +int PQCLEAN_MCELIECE6688128F_AVX_crypto_kem_dec( + uint8_t *key, + const uint8_t *c, + const uint8_t *sk +) { + int i; + + uint8_t ret_confirm = 0; + uint8_t ret_decrypt = 0; + + uint16_t m; + + uint8_t conf[32]; + uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; + uint8_t *e = two_e + 1; + uint8_t preimage[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ]; + uint8_t *x = preimage; + + // + + ret_decrypt = (uint8_t)PQCLEAN_MCELIECE6688128F_AVX_decrypt(e, sk + SYS_N / 8, c); + + crypto_hash_32b(conf, two_e, sizeof(two_e)); + + for (i = 0; i < 32; i++) { + ret_confirm |= conf[i] ^ c[SYND_BYTES + i]; + } + + m = ret_decrypt | ret_confirm; + m -= 1; + m >>= 8; + + *x++ = (~m & 0) | (m & 1); + for (i = 0; i < SYS_N / 8; i++) { + *x++ = (~m & sk[i]) | (m & e[i]); + } + for (i = 0; i < SYND_BYTES + 32; i++) { + *x++ = c[i]; + } + + crypto_hash_32b(key, preimage, sizeof(preimage)); + + return 0; +} + +int PQCLEAN_MCELIECE6688128F_AVX_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +) { + int i; + uint8_t seed[ 32 ]; + uint8_t r[ SYS_T * 2 + (1 << GFBITS)*sizeof(uint32_t) + SYS_N / 8 + 32 ]; + uint8_t nonce[ 16 ] = {0}; + uint8_t *rp; + + gf f[ SYS_T ]; // element in GF(2^mt) + gf irr[ SYS_T ]; // Goppa polynomial + uint32_t perm[ 1 << GFBITS ]; // random permutation + + randombytes(seed, sizeof(seed)); + + while (1) { + rp = r; + PQCLEAN_MCELIECE6688128F_AVX_aes256ctr(r, sizeof(r), nonce, seed); + memcpy(seed, &r[ sizeof(r) - 32 ], 32); + + for (i = 0; i < SYS_T; i++) { + f[i] = 
PQCLEAN_MCELIECE6688128F_AVX_load2(rp + i * 2); + } + rp += sizeof(f); + if (PQCLEAN_MCELIECE6688128F_AVX_genpoly_gen(irr, f)) { + continue; + } + + for (i = 0; i < (1 << GFBITS); i++) { + perm[i] = PQCLEAN_MCELIECE6688128F_AVX_load4(rp + i * 4); + } + rp += sizeof(perm); + if (PQCLEAN_MCELIECE6688128F_AVX_perm_check(perm)) { + continue; + } + + for (i = 0; i < SYS_T; i++) { + PQCLEAN_MCELIECE6688128F_AVX_store2(sk + SYS_N / 8 + i * 2, irr[i]); + } + if (PQCLEAN_MCELIECE6688128F_AVX_pk_gen(pk, perm, sk + SYS_N / 8)) { + continue; + } + + memcpy(sk, rp, SYS_N / 8); + PQCLEAN_MCELIECE6688128F_AVX_controlbits(sk + SYS_N / 8 + IRR_BYTES, perm); + + break; + } + + return 0; +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/params.h b/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/params.h new file mode 100644 index 0000000000..9cbe21f407 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/params.h @@ -0,0 +1,21 @@ +#ifndef PQCLEAN_MCELIECE6688128F_AVX_PARAMS_H +#define PQCLEAN_MCELIECE6688128F_AVX_PARAMS_H + +#define GFBITS 13 +#define SYS_N 6688 +#define SYS_T 128 + +#define COND_BYTES ((1 << (GFBITS-4))*(2*GFBITS - 1)) +#define IRR_BYTES (SYS_T * 2) + +#define PK_NROWS (SYS_T*GFBITS) +#define PK_NCOLS (SYS_N - PK_NROWS) +#define PK_ROW_BYTES ((PK_NCOLS + 7)/8) + +#define SK_BYTES (SYS_N/8 + IRR_BYTES + COND_BYTES) +#define SYND_BYTES ((PK_NROWS + 7)/8) + +#define GFMASK ((1 << GFBITS) - 1) + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/pk_gen.c b/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/pk_gen.c new file mode 100644 index 0000000000..93432c0fbc --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/pk_gen.c @@ -0,0 +1,360 @@ +/* + This file is for public-key generation +*/ + +#include "pk_gen.h" + +#include "benes.h" +#include "controlbits.h" +#include "fft.h" +#include "params.h" +#include "util.h" + +#include <stdint.h> + +#define min(a, b) (((a) < (b)) ? 
(a) : (b)) + +static void de_bitslicing(uint64_t *out, vec256 in[][GFBITS]) { + int i, j, r; + uint64_t u = 0; + + for (i = 0; i < (1 << GFBITS); i++) { + out[i] = 0 ; + } + + for (i = 0; i < 32; i++) { + for (j = GFBITS - 1; j >= 0; j--) { + u = PQCLEAN_MCELIECE6688128F_AVX_vec256_extract(in[i][j], 0); + for (r = 0; r < 64; r++) { + out[i * 256 + 0 * 64 + r] <<= 1; + out[i * 256 + 0 * 64 + r] |= (u >> r) & 1; + } + u = PQCLEAN_MCELIECE6688128F_AVX_vec256_extract(in[i][j], 1); + for (r = 0; r < 64; r++) { + out[i * 256 + 1 * 64 + r] <<= 1; + out[i * 256 + 1 * 64 + r] |= (u >> r) & 1; + } + u = PQCLEAN_MCELIECE6688128F_AVX_vec256_extract(in[i][j], 2); + for (r = 0; r < 64; r++) { + out[i * 256 + 2 * 64 + r] <<= 1; + out[i * 256 + 2 * 64 + r] |= (u >> r) & 1; + } + u = PQCLEAN_MCELIECE6688128F_AVX_vec256_extract(in[i][j], 3); + for (r = 0; r < 64; r++) { + out[i * 256 + 3 * 64 + r] <<= 1; + out[i * 256 + 3 * 64 + r] |= (u >> r) & 1; + } + } + } +} + +static void to_bitslicing_2x(vec256 out0[][GFBITS], vec256 out1[][GFBITS], const uint64_t *in) { + int i, j, k, r; + uint64_t u[4] = {0}; + + for (i = 0; i < 32; i++) { + for (j = GFBITS - 1; j >= 0; j--) { + for (k = 0; k < 4; k++) { + for (r = 63; r >= 0; r--) { + u[k] <<= 1; + u[k] |= (in[i * 256 + k * 64 + r] >> (j + GFBITS)) & 1; + } + } + + out1[i][j] = PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(u[0], u[1], u[2], u[3]); + } + + for (j = GFBITS - 1; j >= 0; j--) { + for (k = 0; k < 4; k++) { + for (r = 63; r >= 0; r--) { + u[k] <<= 1; + u[k] |= (in[i * 256 + k * 64 + r] >> j) & 1; + } + } + + out0[i][GFBITS - 1 - j] = PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(u[0], u[1], u[2], u[3]); + } + } +} + +static void transpose_64x64(uint64_t *out, const uint64_t *in) { + int i, j, s, d; + + uint64_t x, y; + uint64_t masks[6][2] = { + {0x5555555555555555, 0xAAAAAAAAAAAAAAAA}, + {0x3333333333333333, 0xCCCCCCCCCCCCCCCC}, + {0x0F0F0F0F0F0F0F0F, 0xF0F0F0F0F0F0F0F0}, + {0x00FF00FF00FF00FF, 0xFF00FF00FF00FF00}, + {0x0000FFFF0000FFFF, 0xFFFF0000FFFF0000}, + {0x00000000FFFFFFFF, 0xFFFFFFFF00000000} + }; + + for (i = 0; i < 64; i++) { + out[i] = in[i]; + } + + for (d = 5; d >= 0; d--) { + s = 1 << d; + + for (i = 0; i < 64; i += s * 2) { + for (j = i; j < i + s; j++) { + x = (out[j] & masks[d][0]) | ((out[j + s] & masks[d][0]) << s); + y = ((out[j] & masks[d][1]) >> s) | (out[j + s] & masks[d][1]); + + out[j + 0] = x; + out[j + s] = y; + } + } + } +} + +/* return number of trailing zeros of the non-zero input in */ +static inline int ctz(uint64_t in) { + return (int)_tzcnt_u64(in); +} + +static inline uint64_t same_mask(uint16_t x, uint16_t y) { + uint64_t mask; + + mask = x ^ y; + mask -= 1; + mask >>= 63; + mask = -mask; + + return mask; +} + +static int mov_columns(uint64_t mat[][ ((SYS_N + 255) / 256) * 4 ], uint32_t *perm) { + int i, j, k, s, block_idx, row; + uint64_t buf[64], ctz_list[32], t, d, mask; + + row = GFBITS * SYS_T - 32; + block_idx = row / 64; + + // extract the 32x64 matrix + + for (i = 0; i < 32; i++) { + buf[i] = (mat[ row + i ][ block_idx + 0 ] >> 32) | + (mat[ row + i ][ block_idx + 1 ] << 32); + } + + // compute the column indices of pivots by Gaussian elimination. 
+ // the indices are stored in ctz_list + + for (i = 0; i < 32; i++) { + t = buf[i]; + for (j = i + 1; j < 32; j++) { + t |= buf[j]; + } + + if (t == 0) { + return -1; // return if buf is not full rank + } + + ctz_list[i] = s = ctz(t); + + for (j = i + 1; j < 32; j++) { + mask = (buf[i] >> s) & 1; + mask -= 1; + buf[i] ^= buf[j] & mask; + } + for (j = 0; j < i; j++) { + mask = (buf[j] >> s) & 1; + mask = -mask; + buf[j] ^= buf[i] & mask; + } + for (j = i + 1; j < 32; j++) { + mask = (buf[j] >> s) & 1; + mask = -mask; + buf[j] ^= buf[i] & mask; + } + } + + // updating permutation + + for (j = 0; j < 32; j++) { + for (k = j + 1; k < 64; k++) { + d = perm[ row + j ] ^ perm[ row + k ]; + d &= same_mask(k, ctz_list[j]); + perm[ row + j ] ^= d; + perm[ row + k ] ^= d; + } + } + + // moving columns of mat according to the column indices of pivots + + for (i = 0; i < GFBITS * SYS_T; i += 64) { + + for (j = 0; j < min(64, GFBITS * SYS_T - i); j++) { + buf[j] = (mat[ i + j ][ block_idx + 0 ] >> 32) | + (mat[ i + j ][ block_idx + 1 ] << 32); + } + + transpose_64x64(buf, buf); + + for (j = 0; j < 32; j++) { + for (k = j + 1; k < 64; k++) { + d = buf[ j ] ^ buf[ k ]; + d &= same_mask(k, ctz_list[j]); + buf[ j ] ^= d; + buf[ k ] ^= d; + } + } + + transpose_64x64(buf, buf); + + for (j = 0; j < min(64, GFBITS * SYS_T - i); j++) { + mat[ i + j ][ block_idx + 0 ] = (mat[ i + j ][ block_idx + 0 ] << 32 >> 32) | (buf[j] << 32); + mat[ i + j ][ block_idx + 1 ] = (mat[ i + j ][ block_idx + 1 ] >> 32 << 32) | (buf[j] >> 32); + } + } + + return 0; +} + +#define NBLOCKS1_H ((SYS_N + 63) / 64) +#define NBLOCKS2_H ((SYS_N + 255) / 256) +#define NBLOCKS1_I ((GFBITS * SYS_T + 63) / 64) +int PQCLEAN_MCELIECE6688128F_AVX_pk_gen(unsigned char *pk, uint32_t *perm, const unsigned char *sk) { + + int i, j, k; + int row, c; + + uint64_t mat[ GFBITS * SYS_T ][ NBLOCKS2_H * 4 ]; + + uint64_t mask; + + vec128 sk_int[ GFBITS ]; + + vec256 consts[ 32 ][ GFBITS ]; + vec256 eval[ 32 ][ GFBITS ]; + vec256 prod[ 32 ][ GFBITS ]; + vec256 tmp[ GFBITS ]; + + uint64_t list[1 << GFBITS]; + + // compute the inverses + + PQCLEAN_MCELIECE6688128F_AVX_irr_load(sk_int, sk); + + PQCLEAN_MCELIECE6688128F_AVX_fft(eval, sk_int); + + PQCLEAN_MCELIECE6688128F_AVX_vec256_copy(prod[0], eval[0]); + + for (i = 1; i < 32; i++) { + PQCLEAN_MCELIECE6688128F_AVX_vec256_mul(prod[i], prod[i - 1], eval[i]); + } + + PQCLEAN_MCELIECE6688128F_AVX_vec256_inv(tmp, prod[31]); + + for (i = 30; i >= 0; i--) { + PQCLEAN_MCELIECE6688128F_AVX_vec256_mul(prod[i + 1], prod[i], tmp); + PQCLEAN_MCELIECE6688128F_AVX_vec256_mul(tmp, tmp, eval[i + 1]); + } + + PQCLEAN_MCELIECE6688128F_AVX_vec256_copy(prod[0], tmp); + + // fill matrix + + de_bitslicing(list, prod); + + for (i = 0; i < (1 << GFBITS); i++) { + list[i] <<= GFBITS; + list[i] |= i; + list[i] |= ((uint64_t) perm[i]) << 31; + } + + PQCLEAN_MCELIECE6688128F_AVX_sort_63b(1 << GFBITS, list); + + to_bitslicing_2x(consts, prod, list); + + for (i = 0; i < (1 << GFBITS); i++) { + perm[i] = list[i] & GFMASK; + } + + for (j = 0; j < NBLOCKS2_H; j++) { + for (k = 0; k < GFBITS; k++) { + mat[ k ][ 4 * j + 0 ] = PQCLEAN_MCELIECE6688128F_AVX_vec256_extract(prod[ j ][ k ], 0); + mat[ k ][ 4 * j + 1 ] = PQCLEAN_MCELIECE6688128F_AVX_vec256_extract(prod[ j ][ k ], 1); + mat[ k ][ 4 * j + 2 ] = PQCLEAN_MCELIECE6688128F_AVX_vec256_extract(prod[ j ][ k ], 2); + mat[ k ][ 4 * j + 3 ] = PQCLEAN_MCELIECE6688128F_AVX_vec256_extract(prod[ j ][ k ], 3); + } + } + + for (i = 1; i < SYS_T; i++) { + for (j = 0; j < NBLOCKS2_H; j++) { + 
PQCLEAN_MCELIECE6688128F_AVX_vec256_mul(prod[j], prod[j], consts[j]); + + for (k = 0; k < GFBITS; k++) { + mat[ i * GFBITS + k ][ 4 * j + 0 ] = PQCLEAN_MCELIECE6688128F_AVX_vec256_extract(prod[ j ][ k ], 0); + mat[ i * GFBITS + k ][ 4 * j + 1 ] = PQCLEAN_MCELIECE6688128F_AVX_vec256_extract(prod[ j ][ k ], 1); + mat[ i * GFBITS + k ][ 4 * j + 2 ] = PQCLEAN_MCELIECE6688128F_AVX_vec256_extract(prod[ j ][ k ], 2); + mat[ i * GFBITS + k ][ 4 * j + 3 ] = PQCLEAN_MCELIECE6688128F_AVX_vec256_extract(prod[ j ][ k ], 3); + } + } + } + + // gaussian elimination + + for (row = 0; row < PK_NROWS; row++) { + i = row >> 6; + j = row & 63; + + if (row == GFBITS * SYS_T - 32) { + if (mov_columns(mat, perm)) { + return -1; + } + } + + for (k = row + 1; k < PK_NROWS; k++) { + mask = mat[ row ][ i ] >> j; + mask &= 1; + mask -= 1; + + for (c = 0; c < NBLOCKS1_H; c++) { + mat[ row ][ c ] ^= mat[ k ][ c ] & mask; + } + } + + if ( ((mat[ row ][ i ] >> j) & 1) == 0 ) { // return if not systematic + return -1; + } + + for (k = 0; k < row; k++) { + mask = mat[ k ][ i ] >> j; + mask &= 1; + mask = -mask; + + for (c = 0; c < NBLOCKS1_H; c++) { + mat[ k ][ c ] ^= mat[ row ][ c ] & mask; + } + } + + for (k = row + 1; k < PK_NROWS; k++) { + mask = mat[ k ][ i ] >> j; + mask &= 1; + mask = -mask; + + for (c = 0; c < NBLOCKS1_H; c++) { + mat[ k ][ c ] ^= mat[ row ][ c ] & mask; + } + } + } + + for (i = 0; i < GFBITS * SYS_T; i++) { + for (j = NBLOCKS1_I; j < NBLOCKS1_H - 1; j++) { + PQCLEAN_MCELIECE6688128F_AVX_store8(pk, mat[i][j]); + pk += 8; + } + + PQCLEAN_MCELIECE6688128F_AVX_store_i(pk, mat[i][j], PK_ROW_BYTES % 8); + + pk += PK_ROW_BYTES % 8; + } + + // + + return 0; +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/pk_gen.h b/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/pk_gen.h new file mode 100644 index 0000000000..923c3a2ee4 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/pk_gen.h @@ -0,0 +1,12 @@ +#ifndef PQCLEAN_MCELIECE6688128F_AVX_PK_GEN_H +#define PQCLEAN_MCELIECE6688128F_AVX_PK_GEN_H +/* + This file is for public-key generation +*/ + +#include <stdint.h> + +int PQCLEAN_MCELIECE6688128F_AVX_pk_gen(unsigned char * /*pk*/, uint32_t * /*perm*/, const unsigned char * /*sk*/); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/powers.inc b/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/powers.inc new file mode 100644 index 0000000000..aa25d7098f --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/powers.inc @@ -0,0 +1,480 @@ +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + 
PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + 
PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + 
PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + 
PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + 
PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + 
PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + 
PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + 
PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + 
PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + 
PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + 
PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + 
PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + 
PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + 
PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + 
PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +} diff --git a/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/scalars_2x.inc b/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/scalars_2x.inc new file mode 100644 index 0000000000..42d6b9f032 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/scalars_2x.inc @@ -0,0 +1,75 @@ +{ + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0X3C3CF30C0000C003, 0X0C0F0FCF0F0CF330), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0X0CCCC3F333C0000C, 0XF0000FC33C3CCF3C), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0X03C33F33FCC0C03C, 0X3C0F3F00C3C300FC), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0X0003000F3C03C0C0, 0X3C33CCC0F0F3CC30), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0XF33FF33030CF03F0, 0XC0CFFFFFCCCC30CC), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0X0CF0303300F0CCC0, 0X3FC3F3CCFFFC033F), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0XFF3F0C0CC0FF3CC0, 0XFC3030CCCCC0CFCF), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0XCF3CF0FF003FC000, 0X0FCF0C00CCF333C3), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0XC00FF3CF0303F300, 0XCFFCF33000CFF030), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0X3CCC0CC00CF0CC00, 0X00CFFCC330F30FCC), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0XF30FFC3C3FCCFC00, 0X3CCC3FCCC0F3FFF3), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0X3F0FC3F0CCF0C000, 0XF00F0C3FC003C0FF), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0X3000FF33CCF0F000, 0X330CCFCC03C0FC33) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0X0F0F0FF0F000000F, 0XF0FFFFFFF0F00F00), + 
PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0X00FFFFFFFF0000F0, 0X00FFF0FFFF0000FF), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0XFFFF00FF00000F00, 0X00FF00000F0F0FFF), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0XFFF000F00F0FF000, 0XF000F0000F00FF0F), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0XFFF0000F0FF000F0, 0XFF000000FFF00000), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0X00FF000FFF000000, 0XF0FF000FF00F0FF0), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0XFF0F0FFF0F0FF000, 0X0F0F0F00FF000F0F), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0X0FFF0000000F0000, 0X0F0F00F0F0F0F000), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0X00F000F0FFF00F00, 0X00F00F00F00F000F), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0X00F00FF00F00F000, 0X00F0F0F00000FFF0), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0XFFF000F000F00000, 0XFFFFFF0FF00F0FFF), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0X00F00F000FF00000, 0X0F0FFFF00FFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0X0000FF0F0000F000, 0XFFFF0F0FFF0FFF00) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0X00FF0000000000FF, 0X00FF00FF00FF0000), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0XFFFFFFFFFF00FF00, 0XFF00FFFF000000FF), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0XFF0000FF00FF0000, 0X0000FFFF000000FF), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0XFFFF000000FF0000, 0X00FFFF00FF000000), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0XFF00000000FF0000, 0XFFFFFF0000FF00FF), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0X00FFFFFFFF000000, 0X0000FFFF00FFFF00), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0XFF0000FFFFFF0000, 0XFF00FF0000FFFF00), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0XFF00FF00FFFF0000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0X00FFFFFFFF00FF00, 0X0000FF0000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0XFFFF000000000000, 0XFF00FFFF00FFFF00), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0X00FF0000FF000000, 0X00FFFF00000000FF), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0XFF00FF00FF000000, 0X0000FF00FF00FFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0X00FF00FFFF000000, 0XFF0000FFFFFF0000) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0X000000000000FFFF, 0X0000FFFF00000000), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0XFFFFFFFFFFFF0000, 0XFFFFFFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0X0000000000000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0XFFFF0000FFFF0000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0XFFFFFFFFFFFF0000, 0X0000FFFF00000000), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0X0000FFFF00000000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0X0000FFFF00000000, 0XFFFFFFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0XFFFF000000000000, 0X00000000FFFF0000), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0XFFFF000000000000, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0XFFFF000000000000, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0X00000000FFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000), + 
PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0X0000000000000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000) +} diff --git a/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/scalars_4x.inc b/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/scalars_4x.inc new file mode 100644 index 0000000000..573f60904f --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/scalars_4x.inc @@ -0,0 +1,91 @@ +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X3C3CF30C0000C003, 0X0C0F0FCF0F0CF330, 0XF0F30C33CF03F03F, 0X3F30CC0C000F3FCC), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0CCCC3F333C0000C, 0XF0000FC33C3CCF3C, 0X00F30FC00C3300FF, 0XFC3CF030FC3FFF03), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X03C33F33FCC0C03C, 0X3C0F3F00C3C300FC, 0XF3CC3CF3F3FCF33F, 0X33FFFCFF0CCF3CC3), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0003000F3C03C0C0, 0X3C33CCC0F0F3CC30, 0X3C0FC0FC303C3F3C, 0X003CFF33C3CC30CF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF33FF33030CF03F0, 0XC0CFFFFFCCCC30CC, 0XFC30CF303F3FF00F, 0XCFF3CF33C00F3003), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0CF0303300F0CCC0, 0X3FC3F3CCFFFC033F, 0X33300C0CC3300CF3, 0X00F3CC0CF3003CCF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFF3F0C0CC0FF3CC0, 0XFC3030CCCCC0CFCF, 0X3C030CF3F03FF3F3, 0X3C000CFCCC3C3333), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XCF3CF0FF003FC000, 0X0FCF0C00CCF333C3, 0X3CCC03FCCC3FFC03, 0XF3CF03C0FCF03FF0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XC00FF3CF0303F300, 0XCFFCF33000CFF030, 0X033C3C3CF0003FC3, 0X3F3C3CF0C330330C), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X3CCC0CC00CF0CC00, 0X00CFFCC330F30FCC, 0XFFC0FF00F0FF0F03, 0X33CCFCC0FF0033F0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XF30FFC3C3FCCFC00, 0X3CCC3FCCC0F3FFF3, 0XF3F30CF003FCC303, 0X33C300C0F0C003F3), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X3F0FC3F0CCF0C000, 0XF00F0C3FC003C0FF, 0X30CFCFC3CC0F3000, 0X003FF0003F00C00C), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X3000FF33CCF0F000, 0X330CCFCC03C0FC33, 0X0CF30CCF3FCFCC0F, 0XCFF3C3033F030FFF) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0F0F0FF0F000000F, 0XF0FFFFFFF0F00F00, 0X0F0F00FF0FF0FFFF, 0XFF0F0F00F000F0FF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X00FFFFFFFF0000F0, 0X00FFF0FFFF0000FF, 0XF000F0F00F00FF0F, 0X0FFFFFFFFF00000F), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFF00FF00000F00, 0X00FF00000F0F0FFF, 0X000FFFF0FFF0FF0F, 0XF0FFFF000F00F0FF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFF000F00F0FF000, 0XF000F0000F00FF0F, 0X00F00FFF00000FF0, 0X0F0000F00FFF0FFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFF0000F0FF000F0, 0XFF000000FFF00000, 0XFFFFF0000FFFF00F, 0X0F0F0F00FF0F000F), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X00FF000FFF000000, 0XF0FF000FF00F0FF0, 0XFFF0FFF0000FFFF0, 0X000F0F0FFFF0F000), + 
PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFF0F0FFF0F0FF000, 0X0F0F0F00FF000F0F, 0XF0F0F0000F0F0F00, 0XF0FFFF0F00F0FF0F), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0FFF0000000F0000, 0X0F0F00F0F0F0F000, 0X00F000F0F00FFF00, 0X0F0F000F0F00F0FF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X00F000F0FFF00F00, 0X00F00F00F00F000F, 0XF0FF0F0FFF00F0FF, 0X0000F0FF00FF0F0F), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X00F00FF00F00F000, 0X00F0F0F00000FFF0, 0XF0FF0FFFF0F0F0FF, 0X00FFFF0FF0FFF0F0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFF000F000F00000, 0XFFFFFF0FF00F0FFF, 0X00FFFFFFFFFFFFF0, 0X0000000F00F0FFF0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X00F00F000FF00000, 0X0F0FFFF00FFFFFFF, 0X00FFF0F0FF000F0F, 0XF0F00000FF00F0F0), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000FF0F0000F000, 0XFFFF0F0FFF0FFF00, 0X000FFFF0000FFF00, 0X0F0F0FFFFFFFFFFF) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X00FF0000000000FF, 0X00FF00FF00FF0000, 0XFFFF00FF00FF00FF, 0XFF0000FFFFFF00FF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFFFF00FF00, 0XFF00FFFF000000FF, 0X00FFFF000000FF00, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFF0000FF00FF0000, 0X0000FFFF000000FF, 0XFFFF00FFFFFFFF00, 0XFFFF000000FFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFF000000FF0000, 0X00FFFF00FF000000, 0X0000FFFF00FFFFFF, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFF00000000FF0000, 0XFFFFFF0000FF00FF, 0X00FF0000FF0000FF, 0XFFFFFF00FFFFFF00), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X00FFFFFFFF000000, 0X0000FFFF00FFFF00, 0XFFFF0000FF00FFFF, 0X00FFFF00FFFF00FF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFF0000FFFFFF0000, 0XFF00FF0000FFFF00, 0XFF000000FFFFFF00, 0X0000FFFF00FF0000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFF00FF00FFFF0000, 0X00000000FFFFFFFF, 0X000000000000FFFF, 0X000000FFFF000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X00FFFFFFFF00FF00, 0X0000FF0000000000, 0XFF00FF00FFFF0000, 0XFF00FF0000FF00FF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFF000000000000, 0XFF00FFFF00FFFF00, 0XFFFF00FFFF00FFFF, 0X00FF0000000000FF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X00FF0000FF000000, 0X00FFFF00000000FF, 0XFFFFFFFFFF00FF00, 0XFF00FFFF00FF00FF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFF00FF00FF000000, 0X0000FF00FF00FFFF, 0XFFFF00FFFF0000FF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X00FF00FFFF000000, 0XFF0000FFFFFF0000, 0X0000FF00000000FF, 0X0000FF000000FFFF) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X000000000000FFFF, 0X0000FFFF00000000, 0XFFFFFFFFFFFFFFFF, 0X0000FFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFFFFFF0000, 0XFFFFFFFF0000FFFF, 0XFFFFFFFF00000000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X00000000FFFFFFFF, 0XFFFF000000000000, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000000000000000, 0X0000FFFF00000000, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFFFFFF0000, 0X0000FFFF00000000, 0X00000000FFFF0000, 0X00000000FFFF0000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000FFFF00000000, 0XFFFF0000FFFF0000, 0X0000FFFFFFFFFFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000, 0X0000FFFFFFFFFFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0XFFFFFFFF00000000, 0XFFFF00000000FFFF), + 
PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000FFFF00000000, 0XFFFFFFFF0000FFFF, 0X000000000000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFF000000000000, 0X00000000FFFF0000, 0X000000000000FFFF, 0X0000FFFF00000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFF000000000000, 0XFFFF0000FFFFFFFF, 0XFFFFFFFFFFFF0000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFF000000000000, 0XFFFF0000FFFFFFFF, 0XFFFFFFFF0000FFFF, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0X0000000000000000, 0XFFFF0000FFFFFFFF, 0X0000FFFFFFFFFFFF) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X0000000000000000, 0X00000000FFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFF00000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0X0000000000000000, 0XFFFFFFFF00000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFF00000000, 0X00000000FFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFF00000000) +}, +{ + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + 
PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF) +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/sk_gen.c b/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/sk_gen.c new file mode 100644 index 0000000000..21937714b9 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/sk_gen.c @@ -0,0 +1,98 @@ +/* + This file is for secret-key generation +*/ + +#include "sk_gen.h" + +#include "controlbits.h" +#include "gf.h" +#include "params.h" +#include "util.h" + +/* input: f, element in GF((2^m)^t) */ +/* output: out, minimal polynomial of f */ +/* return: 0 for success and -1 for failure */ +int PQCLEAN_MCELIECE6688128F_AVX_genpoly_gen(gf *out, gf *f) { + int i, j, k, c; + + gf mat[ SYS_T + 1 ][ SYS_T ]; + gf mask, inv, t; + + // fill matrix + + mat[0][0] = 1; + + for (i = 1; i < SYS_T; i++) { + mat[0][i] = 0; + } + + for (i = 0; i < SYS_T; i++) { + mat[1][i] = f[i]; + } + + for (j = 2; j <= SYS_T; j++) { + PQCLEAN_MCELIECE6688128F_AVX_GF_mul(mat[j], mat[j - 1], f); + } + + // gaussian + + for (j = 0; j < SYS_T; j++) { + for (k = j + 1; k < SYS_T; k++) { + mask = PQCLEAN_MCELIECE6688128F_AVX_gf_iszero(mat[ j ][ j ]); + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ j ] ^= mat[ c ][ k ] & mask; + } + + } + + if ( mat[ j ][ j ] == 0 ) { // return if not systematic + return -1; + } + + inv = PQCLEAN_MCELIECE6688128F_AVX_gf_inv(mat[j][j]); + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ j ] = PQCLEAN_MCELIECE6688128F_AVX_gf_mul(mat[ c ][ j ], inv) ; + } + + for (k = 0; k < SYS_T; k++) { + if (k != j) { + t = mat[ j ][ k ]; + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ k ] ^= PQCLEAN_MCELIECE6688128F_AVX_gf_mul(mat[ c ][ j ], t); + } + } + } + } + + for (i = 0; i < SYS_T; i++) { + out[i] = mat[ SYS_T ][ i ]; + } + + return 0; +} + +/* input: permutation p represented as a list of 32-bit intergers */ +/* output: -1 if some interger repeats in p */ +/* 0 otherwise */ +int PQCLEAN_MCELIECE6688128F_AVX_perm_check(const uint32_t *p) { + int i; + uint64_t list[1 << GFBITS]; + + for (i = 0; i < (1 << GFBITS); i++) { + list[i] = p[i]; + } + + PQCLEAN_MCELIECE6688128F_AVX_sort_63b(1 << GFBITS, list); + + for (i = 1; i < (1 << GFBITS); i++) { + if (list[i - 1] == list[i]) { + return -1; + } + } + + return 0; +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/sk_gen.h b/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/sk_gen.h new file mode 100644 index 0000000000..c87cf34bfd --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/sk_gen.h @@ -0,0 +1,16 @@ +#ifndef PQCLEAN_MCELIECE6688128F_AVX_SK_GEN_H +#define PQCLEAN_MCELIECE6688128F_AVX_SK_GEN_H +/* + This file is for secret-key generation +*/ + + +#include "gf.h" + +#include + +int PQCLEAN_MCELIECE6688128F_AVX_genpoly_gen(gf * /*out*/, gf * /*f*/); +int PQCLEAN_MCELIECE6688128F_AVX_perm_check(const uint32_t * /*p*/); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/syndrome_asm.S b/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/syndrome_asm.S new file mode 100644 index 0000000000..1b9d6d1744 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/syndrome_asm.S @@ -0,0 +1,810 @@ + +# qhasm: 
int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: int64 b64 + +# qhasm: int64 synd + +# qhasm: int64 addr + +# qhasm: int64 c + +# qhasm: int64 c_all + +# qhasm: int64 row + +# qhasm: int64 p + +# qhasm: int64 e + +# qhasm: int64 s + +# qhasm: reg256 pp + +# qhasm: reg256 ee + +# qhasm: reg256 ss + +# qhasm: int64 buf_ptr + +# qhasm: stack256 buf + +# qhasm: enter syndrome_asm +.p2align 5 +.global _PQCLEAN_MCELIECE6688128F_AVX_syndrome_asm +.global PQCLEAN_MCELIECE6688128F_AVX_syndrome_asm +_PQCLEAN_MCELIECE6688128F_AVX_syndrome_asm: +PQCLEAN_MCELIECE6688128F_AVX_syndrome_asm: +mov %rsp,%r11 +and $31,%r11 +add $32,%r11 +sub %r11,%rsp + +# qhasm: input_1 += 1044364 +# asm 1: add $1044364,buf_ptr=int64#4 +# asm 2: leaq buf_ptr=%rcx +leaq 0(%rsp),%rcx + +# qhasm: row = 1664 +# asm 1: mov $1664,>row=int64#5 +# asm 2: mov $1664,>row=%r8 +mov $1664,%r8 + +# qhasm: loop: +._loop: + +# qhasm: row -= 1 +# asm 1: sub $1,ss=reg256#1 +# asm 2: vmovupd 0(ss=%ymm0 +vmovupd 0(%rsi),%ymm0 + +# qhasm: ee = mem256[ input_2 + 208 ] +# asm 1: vmovupd 208(ee=reg256#2 +# asm 2: vmovupd 208(ee=%ymm1 +vmovupd 208(%rdx),%ymm1 + +# qhasm: ss &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 32(pp=%ymm1 +vmovupd 32(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 240 ] +# asm 1: vmovupd 240(ee=reg256#3 +# asm 2: vmovupd 240(ee=%ymm2 +vmovupd 240(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 64(pp=%ymm1 +vmovupd 64(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 272 ] +# asm 1: vmovupd 272(ee=reg256#3 +# asm 2: vmovupd 272(ee=%ymm2 +vmovupd 272(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 96(pp=%ymm1 +vmovupd 96(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 304 ] +# asm 1: vmovupd 304(ee=reg256#3 +# asm 2: vmovupd 304(ee=%ymm2 +vmovupd 304(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 128(pp=%ymm1 +vmovupd 128(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 336 ] +# asm 1: vmovupd 336(ee=reg256#3 +# asm 2: vmovupd 336(ee=%ymm2 +vmovupd 336(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 160(pp=%ymm1 +vmovupd 160(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 368 ] +# asm 1: vmovupd 368(ee=reg256#3 +# asm 2: vmovupd 368(ee=%ymm2 +vmovupd 368(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 192(pp=%ymm1 +vmovupd 192(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 400 ] +# asm 1: vmovupd 400(ee=reg256#3 +# asm 2: vmovupd 400(ee=%ymm2 +vmovupd 400(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 224(pp=%ymm1 +vmovupd 224(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 432 ] +# asm 1: vmovupd 432(ee=reg256#3 +# asm 2: vmovupd 432(ee=%ymm2 +vmovupd 432(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 256(pp=%ymm1 +vmovupd 256(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 464 ] +# asm 1: vmovupd 464(ee=reg256#3 +# asm 2: vmovupd 464(ee=%ymm2 +vmovupd 464(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 288(pp=%ymm1 +vmovupd 288(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 496 ] +# asm 1: vmovupd 496(ee=reg256#3 +# asm 
2: vmovupd 496(ee=%ymm2 +vmovupd 496(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 320(pp=%ymm1 +vmovupd 320(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 528 ] +# asm 1: vmovupd 528(ee=reg256#3 +# asm 2: vmovupd 528(ee=%ymm2 +vmovupd 528(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 352(pp=%ymm1 +vmovupd 352(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 560 ] +# asm 1: vmovupd 560(ee=reg256#3 +# asm 2: vmovupd 560(ee=%ymm2 +vmovupd 560(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 384(pp=%ymm1 +vmovupd 384(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 592 ] +# asm 1: vmovupd 592(ee=reg256#3 +# asm 2: vmovupd 592(ee=%ymm2 +vmovupd 592(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 416(pp=%ymm1 +vmovupd 416(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 624 ] +# asm 1: vmovupd 624(ee=reg256#3 +# asm 2: vmovupd 624(ee=%ymm2 +vmovupd 624(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 448(pp=%ymm1 +vmovupd 448(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 656 ] +# asm 1: vmovupd 656(ee=reg256#3 +# asm 2: vmovupd 656(ee=%ymm2 +vmovupd 656(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 480(pp=%ymm1 +vmovupd 480(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 688 ] +# asm 1: vmovupd 688(ee=reg256#3 +# asm 2: vmovupd 688(ee=%ymm2 +vmovupd 688(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 512(pp=%ymm1 +vmovupd 512(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 720 ] +# asm 1: vmovupd 720(ee=reg256#3 +# asm 2: vmovupd 720(ee=%ymm2 +vmovupd 720(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 544(pp=%ymm1 +vmovupd 544(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 752 ] +# asm 1: vmovupd 752(ee=reg256#3 +# asm 2: vmovupd 752(ee=%ymm2 +vmovupd 752(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 576(pp=%ymm1 +vmovupd 576(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 784 ] +# asm 1: vmovupd 784(ee=reg256#3 +# asm 2: vmovupd 784(ee=%ymm2 +vmovupd 784(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand buf=stack256#1 +# asm 2: vmovapd buf=0(%rsp) +vmovapd %ymm0,0(%rsp) + +# qhasm: s = mem64[input_1 + 608] +# asm 1: movq 608(s=int64#6 +# asm 2: movq 608(s=%r9 +movq 608(%rsi),%r9 + +# qhasm: e = mem64[input_2 + 816] +# asm 1: movq 816(e=int64#7 +# asm 2: movq 816(e=%rax +movq 816(%rdx),%rax + +# qhasm: s &= e +# asm 1: and p=int64#7 +# asm 2: movq 616(p=%rax +movq 616(%rsi),%rax + +# qhasm: e = mem64[input_2 + 824] +# asm 1: movq 824(e=int64#8 +# asm 2: movq 824(e=%r10 +movq 824(%rdx),%r10 + +# qhasm: p &= e +# asm 1: and p=int64#7d +# asm 2: movl 624(p=%eax +movl 624(%rsi),%eax + +# qhasm: e = *(uint32 *)(input_2 + 832) +# asm 1: movl 832(e=int64#8d +# asm 2: movl 832(e=%r10d +movl 832(%rdx),%r10d + +# qhasm: p &= e +# asm 1: and c_all=int64#6 +# asm 2: popcnt c_all=%r9 +popcnt %r9, %r9 + +# qhasm: b64 = mem64[ buf_ptr + 0 ] +# asm 1: movq 0(b64=int64#7 +# asm 2: movq 0(b64=%rax +movq 0(%rcx),%rax + +# qhasm: c = count(b64) +# asm 1: popcnt c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# qhasm: c_all ^= c +# asm 1: xor b64=int64#7 +# asm 2: movq 8(b64=%rax +movq 8(%rcx),%rax + +# qhasm: c = count(b64) +# asm 1: popcnt c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# qhasm: c_all ^= c +# asm 1: xor b64=int64#7 +# asm 2: movq 16(b64=%rax +movq 16(%rcx),%rax + +# qhasm: c = count(b64) +# asm 1: popcnt 
c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# qhasm: c_all ^= c +# asm 1: xor b64=int64#7 +# asm 2: movq 24(b64=%rax +movq 24(%rcx),%rax + +# qhasm: c = count(b64) +# asm 1: popcnt c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# qhasm: c_all ^= c +# asm 1: xor addr=int64#7 +# asm 2: mov addr=%rax +mov %r8,%rax + +# qhasm: (uint64) addr >>= 3 +# asm 1: shr $3,synd=int64#8 +# asm 2: movzbq 0(synd=%r10 +movzbq 0(%rax),%r10 + +# qhasm: synd <<= 1 +# asm 1: shl $1,ss=reg256#1 +# asm 2: vmovupd 0(ss=%ymm0 +vmovupd 0(%rdi),%ymm0 + +# qhasm: ee = mem256[ input_2 + 0 ] +# asm 1: vmovupd 0(ee=reg256#2 +# asm 2: vmovupd 0(ee=%ymm1 +vmovupd 0(%rdx),%ymm1 + +# qhasm: ss ^= ee +# asm 1: vpxor ss=reg256#1 +# asm 2: vmovupd 32(ss=%ymm0 +vmovupd 32(%rdi),%ymm0 + +# qhasm: ee = mem256[ input_2 + 32 ] +# asm 1: vmovupd 32(ee=reg256#2 +# asm 2: vmovupd 32(ee=%ymm1 +vmovupd 32(%rdx),%ymm1 + +# qhasm: ss ^= ee +# asm 1: vpxor ss=reg256#1 +# asm 2: vmovupd 64(ss=%ymm0 +vmovupd 64(%rdi),%ymm0 + +# qhasm: ee = mem256[ input_2 + 64 ] +# asm 1: vmovupd 64(ee=reg256#2 +# asm 2: vmovupd 64(ee=%ymm1 +vmovupd 64(%rdx),%ymm1 + +# qhasm: ss ^= ee +# asm 1: vpxor ss=reg256#1 +# asm 2: vmovupd 96(ss=%ymm0 +vmovupd 96(%rdi),%ymm0 + +# qhasm: ee = mem256[ input_2 + 96 ] +# asm 1: vmovupd 96(ee=reg256#2 +# asm 2: vmovupd 96(ee=%ymm1 +vmovupd 96(%rdx),%ymm1 + +# qhasm: ss ^= ee +# asm 1: vpxor ss=reg256#1 +# asm 2: vmovupd 128(ss=%ymm0 +vmovupd 128(%rdi),%ymm0 + +# qhasm: ee = mem256[ input_2 + 128 ] +# asm 1: vmovupd 128(ee=reg256#2 +# asm 2: vmovupd 128(ee=%ymm1 +vmovupd 128(%rdx),%ymm1 + +# qhasm: ss ^= ee +# asm 1: vpxor ss=reg256#1 +# asm 2: vmovupd 160(ss=%ymm0 +vmovupd 160(%rdi),%ymm0 + +# qhasm: ee = mem256[ input_2 + 160 ] +# asm 1: vmovupd 160(ee=reg256#2 +# asm 2: vmovupd 160(ee=%ymm1 +vmovupd 160(%rdx),%ymm1 + +# qhasm: ss ^= ee +# asm 1: vpxor s=int64#2 +# asm 2: movq 192(s=%rsi +movq 192(%rdi),%rsi + +# qhasm: e = mem64[ input_2 + 192 ] +# asm 1: movq 192(e=int64#4 +# asm 2: movq 192(e=%rcx +movq 192(%rdx),%rcx + +# qhasm: s ^= e +# asm 1: xor s=int64#2 +# asm 2: movq 200(s=%rsi +movq 200(%rdi),%rsi + +# qhasm: e = mem64[ input_2 + 200 ] +# asm 1: movq 200(e=int64#3 +# asm 2: movq 200(e=%rdx +movq 200(%rdx),%rdx + +# qhasm: s ^= e +# asm 1: xor mask0=reg128#1 +# asm 2: movdqa PQCLEAN_MCELIECE6688128F_AVX_MASK5_0(%rip),>mask0=%xmm0 +movdqa PQCLEAN_MCELIECE6688128F_AVX_MASK5_0(%rip),%xmm0 + +# qhasm: mask1 aligned= mem128[ PQCLEAN_MCELIECE6688128F_AVX_MASK5_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE6688128F_AVX_MASK5_1(%rip),>mask1=reg128#2 +# asm 2: movdqa PQCLEAN_MCELIECE6688128F_AVX_MASK5_1(%rip),>mask1=%xmm1 +movdqa PQCLEAN_MCELIECE6688128F_AVX_MASK5_1(%rip),%xmm1 + +# qhasm: mask2 aligned= mem128[ PQCLEAN_MCELIECE6688128F_AVX_MASK4_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE6688128F_AVX_MASK4_0(%rip),>mask2=reg128#3 +# asm 2: movdqa PQCLEAN_MCELIECE6688128F_AVX_MASK4_0(%rip),>mask2=%xmm2 +movdqa PQCLEAN_MCELIECE6688128F_AVX_MASK4_0(%rip),%xmm2 + +# qhasm: mask3 aligned= mem128[ PQCLEAN_MCELIECE6688128F_AVX_MASK4_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE6688128F_AVX_MASK4_1(%rip),>mask3=reg128#4 +# asm 2: movdqa PQCLEAN_MCELIECE6688128F_AVX_MASK4_1(%rip),>mask3=%xmm3 +movdqa PQCLEAN_MCELIECE6688128F_AVX_MASK4_1(%rip),%xmm3 + +# qhasm: mask4 aligned= mem128[ PQCLEAN_MCELIECE6688128F_AVX_MASK3_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE6688128F_AVX_MASK3_0(%rip),>mask4=reg128#5 +# asm 2: movdqa PQCLEAN_MCELIECE6688128F_AVX_MASK3_0(%rip),>mask4=%xmm4 +movdqa PQCLEAN_MCELIECE6688128F_AVX_MASK3_0(%rip),%xmm4 + +# 
qhasm: mask5 aligned= mem128[ PQCLEAN_MCELIECE6688128F_AVX_MASK3_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE6688128F_AVX_MASK3_1(%rip),>mask5=reg128#6 +# asm 2: movdqa PQCLEAN_MCELIECE6688128F_AVX_MASK3_1(%rip),>mask5=%xmm5 +movdqa PQCLEAN_MCELIECE6688128F_AVX_MASK3_1(%rip),%xmm5 + +# qhasm: x0 = mem128[ input_0 + 0 ] +# asm 1: movdqu 0(x0=reg128#7 +# asm 2: movdqu 0(x0=%xmm6 +movdqu 0(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 128 ] +# asm 1: movdqu 128(x1=reg128#8 +# asm 2: movdqu 128(x1=%xmm7 +movdqu 128(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 256 ] +# asm 1: movdqu 256(x2=reg128#9 +# asm 2: movdqu 256(x2=%xmm8 +movdqu 256(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 384 ] +# asm 1: movdqu 384(x3=reg128#10 +# asm 2: movdqu 384(x3=%xmm9 +movdqu 384(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 512 ] +# asm 1: movdqu 512(x4=reg128#11 +# asm 2: movdqu 512(x4=%xmm10 +movdqu 512(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 640 ] +# asm 1: movdqu 640(x5=reg128#12 +# asm 2: movdqu 640(x5=%xmm11 +movdqu 640(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 768 ] +# asm 1: movdqu 768(x6=reg128#13 +# asm 2: movdqu 768(x6=%xmm12 +movdqu 768(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 896 ] +# asm 1: movdqu 896(x7=reg128#14 +# asm 2: movdqu 896(x7=%xmm13 +movdqu 896(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq 
$32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw 
$8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 0 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 16(x0=%xmm6 +movdqu 16(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 144 ] +# asm 1: movdqu 144(x1=reg128#8 +# asm 2: movdqu 144(x1=%xmm7 +movdqu 144(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 272 ] +# asm 1: movdqu 272(x2=reg128#9 +# asm 2: movdqu 272(x2=%xmm8 +movdqu 272(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 400 ] +# asm 1: movdqu 400(x3=reg128#10 +# asm 2: movdqu 400(x3=%xmm9 +movdqu 400(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 528 ] +# asm 1: movdqu 528(x4=reg128#11 +# asm 2: movdqu 528(x4=%xmm10 +movdqu 528(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 656 ] +# asm 1: movdqu 656(x5=reg128#12 +# asm 2: movdqu 656(x5=%xmm11 +movdqu 656(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 784 ] +# asm 1: movdqu 784(x6=reg128#13 +# asm 2: movdqu 784(x6=%xmm12 +movdqu 784(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 912 ] +# asm 1: movdqu 912(x7=reg128#14 +# asm 2: movdqu 912(x7=%xmm13 +movdqu 912(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x 
v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# 
qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & 
mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 16 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 32(x0=%xmm6 +movdqu 32(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 160 ] +# asm 1: movdqu 160(x1=reg128#8 +# asm 2: movdqu 160(x1=%xmm7 +movdqu 160(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 288 ] +# asm 1: movdqu 288(x2=reg128#9 +# asm 2: movdqu 288(x2=%xmm8 +movdqu 288(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 416 ] +# asm 1: movdqu 416(x3=reg128#10 +# asm 2: movdqu 416(x3=%xmm9 +movdqu 416(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 544 ] +# asm 1: movdqu 544(x4=reg128#11 +# asm 2: movdqu 544(x4=%xmm10 +movdqu 544(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 672 ] +# asm 1: movdqu 672(x5=reg128#12 +# asm 2: movdqu 672(x5=%xmm11 +movdqu 672(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 800 ] +# asm 1: movdqu 800(x6=reg128#13 +# asm 2: movdqu 800(x6=%xmm12 +movdqu 800(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 928 ] +# asm 1: movdqu 928(x7=reg128#14 +# asm 2: movdqu 928(x7=%xmm13 +movdqu 928(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: 
vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 
+# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 32 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 48(x0=%xmm6 +movdqu 48(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 176 ] +# asm 1: movdqu 176(x1=reg128#8 +# asm 2: movdqu 176(x1=%xmm7 +movdqu 176(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 304 ] +# asm 1: movdqu 304(x2=reg128#9 +# asm 2: movdqu 304(x2=%xmm8 +movdqu 304(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 432 ] +# asm 1: movdqu 432(x3=reg128#10 +# asm 2: movdqu 
432(x3=%xmm9 +movdqu 432(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 560 ] +# asm 1: movdqu 560(x4=reg128#11 +# asm 2: movdqu 560(x4=%xmm10 +movdqu 560(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 688 ] +# asm 1: movdqu 688(x5=reg128#12 +# asm 2: movdqu 688(x5=%xmm11 +movdqu 688(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 816 ] +# asm 1: movdqu 816(x6=reg128#13 +# asm 2: movdqu 816(x6=%xmm12 +movdqu 816(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 944 ] +# asm 1: movdqu 944(x7=reg128#14 +# asm 2: movdqu 944(x7=%xmm13 +movdqu 944(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x 
v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & 
mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 48 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 64(x0=%xmm6 +movdqu 64(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 192 ] +# asm 1: movdqu 192(x1=reg128#8 +# asm 2: movdqu 192(x1=%xmm7 +movdqu 192(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 320 ] +# asm 1: movdqu 320(x2=reg128#9 +# asm 2: movdqu 320(x2=%xmm8 +movdqu 320(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 448 ] +# asm 1: movdqu 448(x3=reg128#10 +# asm 2: movdqu 448(x3=%xmm9 +movdqu 448(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 576 ] +# asm 1: movdqu 576(x4=reg128#11 +# asm 2: movdqu 576(x4=%xmm10 +movdqu 576(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 704 ] +# asm 1: movdqu 704(x5=reg128#12 +# asm 2: movdqu 704(x5=%xmm11 +movdqu 704(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 832 ] +# asm 1: movdqu 832(x6=reg128#13 +# asm 2: movdqu 832(x6=%xmm12 +movdqu 832(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 960 ] +# asm 1: movdqu 960(x7=reg128#14 +# asm 2: movdqu 960(x7=%xmm13 +movdqu 960(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: 
vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld 
$16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand 
%xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 64 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 80(x0=%xmm6 +movdqu 80(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 208 ] +# asm 1: movdqu 208(x1=reg128#8 +# asm 2: movdqu 208(x1=%xmm7 +movdqu 208(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 336 ] +# asm 1: movdqu 336(x2=reg128#9 +# asm 2: movdqu 336(x2=%xmm8 +movdqu 336(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 464 ] +# asm 1: movdqu 464(x3=reg128#10 +# asm 2: movdqu 464(x3=%xmm9 +movdqu 464(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 592 ] +# asm 1: movdqu 592(x4=reg128#11 +# asm 2: movdqu 592(x4=%xmm10 +movdqu 592(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 720 ] +# asm 1: movdqu 720(x5=reg128#12 +# asm 2: movdqu 720(x5=%xmm11 +movdqu 720(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 848 ] +# asm 1: movdqu 848(x6=reg128#13 +# asm 2: movdqu 848(x6=%xmm12 +movdqu 848(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 976 ] +# asm 1: movdqu 976(x7=reg128#14 +# asm 2: movdqu 976(x7=%xmm13 +movdqu 976(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# 
qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# 
qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 80 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 96(x0=%xmm6 +movdqu 96(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 224 ] +# asm 1: movdqu 224(x1=reg128#8 +# asm 2: movdqu 224(x1=%xmm7 +movdqu 224(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 352 ] +# asm 1: movdqu 352(x2=reg128#9 +# asm 2: movdqu 352(x2=%xmm8 +movdqu 352(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 480 ] +# asm 1: movdqu 480(x3=reg128#10 +# asm 2: movdqu 480(x3=%xmm9 +movdqu 480(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 608 ] +# asm 1: movdqu 608(x4=reg128#11 +# asm 2: movdqu 608(x4=%xmm10 +movdqu 608(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 736 ] +# asm 1: movdqu 736(x5=reg128#12 +# asm 2: movdqu 736(x5=%xmm11 +movdqu 736(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 864 ] +# asm 1: movdqu 864(x6=reg128#13 +# asm 2: movdqu 864(x6=%xmm12 +movdqu 864(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 992 ] +# asm 1: movdqu 992(x7=reg128#14 +# asm 2: movdqu 992(x7=%xmm13 +movdqu 992(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq 
[qhasm-generated bit-matrix transpose assembly added for PQCLEAN_MCELIECE6688128F_AVX, continued: repeated rounds that load eight 128-bit rows from input_0 (%rdi) with movdqu, combine them with the constants PQCLEAN_MCELIECE6688128F_AVX_MASK0_0 ... PQCLEAN_MCELIECE6688128F_AVX_MASK2_1 using vpand/vpor and lane shifts (vpsllq/vpsrlq, vpslld/vpsrld, vpsllw/vpsrlw, psllq/psrlq) to exchange 32-, 16-, 8-, 4-, 2- and 1-bit groups between pairs of rows, and store the transposed rows back with movdqu.]
v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 640 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 768(x0=%xmm6 +movdqu 768(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 784 ] +# asm 1: movdqu 784(x1=reg128#8 +# asm 2: movdqu 784(x1=%xmm7 +movdqu 784(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 800 ] +# asm 1: movdqu 800(x2=reg128#9 +# asm 2: movdqu 800(x2=%xmm8 +movdqu 800(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 816 ] +# asm 1: movdqu 816(x3=reg128#10 +# asm 2: movdqu 816(x3=%xmm9 +movdqu 816(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 832 ] +# asm 1: movdqu 832(x4=reg128#11 +# asm 2: movdqu 832(x4=%xmm10 +movdqu 832(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 848 ] +# asm 1: movdqu 848(x5=reg128#12 +# asm 2: movdqu 848(x5=%xmm11 +movdqu 848(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 864 ] +# asm 1: movdqu 864(x6=reg128#13 +# asm 2: movdqu 864(x6=%xmm12 +movdqu 864(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 880 ] +# asm 1: movdqu 880(x7=reg128#14 +# asm 2: movdqu 880(x7=%xmm13 +movdqu 880(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + 
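+# Note (annotation added for readability; not part of the upstream qhasm output,
+# and the names s, m_even, m_odd below are illustrative only): every block in
+# this generated routine repeats one masked-shift swap, the standard step of a
+# bitwise matrix transpose. For a register pair (a, b) and masks selecting
+# alternating bit groups of width s, it computes
+#   a' = (a & m_even) | ((b & m_even) << s)
+#   b' = ((a & m_odd) >> s) | (b & m_odd)
+# with s = 4, 2, 1 in this 128-bit (xmm) section and s = 32, 16, 8 in the
+# 256-bit (ymm) section further below, where the plain vpsllq/vpslld/vpsllw
+# shifts stand in for the masked left shift.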
+# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand 
%xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand 
%xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 768 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 896(x0=%xmm6 +movdqu 896(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 912 ] +# asm 1: movdqu 912(x1=reg128#8 +# asm 2: movdqu 912(x1=%xmm7 +movdqu 912(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 928 ] +# asm 1: movdqu 928(x2=reg128#9 +# asm 2: movdqu 928(x2=%xmm8 +movdqu 928(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 944 ] +# asm 1: movdqu 944(x3=reg128#10 +# asm 2: movdqu 944(x3=%xmm9 +movdqu 944(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 960 ] +# asm 1: movdqu 960(x4=reg128#11 +# asm 2: movdqu 960(x4=%xmm10 +movdqu 960(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 976 ] +# asm 1: movdqu 976(x5=reg128#12 +# asm 2: movdqu 976(x5=%xmm11 +movdqu 976(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 992 ] +# asm 1: movdqu 992(x6=reg128#13 +# asm 2: movdqu 992(x6=%xmm12 +movdqu 992(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 1008 ] +# asm 1: movdqu 1008(x7=reg128#14 +# asm 2: movdqu 1008(x7=%xmm13 +movdqu 1008(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#1 +# 
asm 2: vpand v10=%xmm0 +vpand %xmm0,%xmm13,%xmm0 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm1,%xmm13,%xmm1 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm12,%xmm0 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm9,%xmm1 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm2,%xmm14,%xmm9 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#13 +# asm 2: vpand v10=%xmm12 +vpand %xmm2,%xmm11,%xmm12 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm3,%xmm14,%xmm13 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm12,%xmm9,%xmm9 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm13,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm10,%xmm12 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#14 +# asm 2: vpand v10=%xmm13 +vpand %xmm2,%xmm0,%xmm13 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm3,%xmm0,%xmm0 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#13 +# asm 2: vpor x1=%xmm12 +vpor %xmm13,%xmm12,%xmm12 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm10,%xmm0 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm2,%xmm6,%xmm10 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#14 +# asm 2: vpand v10=%xmm13 +vpand %xmm2,%xmm8,%xmm13 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#11 +# asm 2: vpor x4=%xmm10 +vpor %xmm13,%xmm10,%xmm10 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#3 +# asm 2: vpand v10=%xmm2 +vpand %xmm2,%xmm1,%xmm2 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm3,%xmm1,%xmm1 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#3 +# asm 2: vpor x5=%xmm2 +vpor %xmm2,%xmm8,%xmm2 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm7,%xmm1 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#4 +# asm 2: vpand v00=%xmm3 +vpand %xmm4,%xmm9,%xmm3 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#8 +# asm 2: vpand v10=%xmm7 +vpand %xmm4,%xmm12,%xmm7 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm5,%xmm9,%xmm8 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand 
%xmm5,%xmm12,%xmm9 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#4 +# asm 2: vpor x0=%xmm3 +vpor %xmm7,%xmm3,%xmm3 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#8 +# asm 2: vpor x1=%xmm7 +vpor %xmm9,%xmm8,%xmm7 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm4,%xmm11,%xmm8 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#10 +# asm 2: vpand v10=%xmm9 +vpand %xmm4,%xmm0,%xmm9 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm5,%xmm0,%xmm0 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#9 +# asm 2: vpor x2=%xmm8 +vpor %xmm9,%xmm8,%xmm8 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm11,%xmm0 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm10,%xmm9 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#12 +# asm 2: vpand v10=%xmm11 +vpand %xmm4,%xmm2,%xmm11 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#3 +# asm 2: vpand v11=%xmm2 +vpand %xmm5,%xmm2,%xmm2 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#10 +# asm 2: vpor x4=%xmm9 +vpor %xmm11,%xmm9,%xmm9 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#3 +# asm 2: vpor x5=%xmm2 +vpor %xmm2,%xmm10,%xmm2 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm4,%xmm6,%xmm10 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#5 +# asm 2: vpand v10=%xmm4 +vpand %xmm4,%xmm1,%xmm4 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm5,%xmm1,%xmm1 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#5 +# asm 2: vpor x6=%xmm4 +vpor %xmm4,%xmm10,%xmm4 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm6,%xmm1 + +# qhasm: mem128[ input_0 + 896 ] = x0 +# asm 1: movdqu mask0=reg256#1 +# asm 2: vmovapd PQCLEAN_MCELIECE6688128F_AVX_MASK5_0,>mask0=%ymm0 +vmovapd PQCLEAN_MCELIECE6688128F_AVX_MASK5_0(%rip),%ymm0 + +# qhasm: mask1 aligned= mem256[ PQCLEAN_MCELIECE6688128F_AVX_MASK5_1 ] +# asm 1: vmovapd PQCLEAN_MCELIECE6688128F_AVX_MASK5_1,>mask1=reg256#2 +# asm 2: vmovapd PQCLEAN_MCELIECE6688128F_AVX_MASK5_1,>mask1=%ymm1 +vmovapd PQCLEAN_MCELIECE6688128F_AVX_MASK5_1(%rip),%ymm1 + +# qhasm: mask2 aligned= mem256[ PQCLEAN_MCELIECE6688128F_AVX_MASK4_0 ] +# asm 1: vmovapd PQCLEAN_MCELIECE6688128F_AVX_MASK4_0,>mask2=reg256#3 +# asm 2: vmovapd PQCLEAN_MCELIECE6688128F_AVX_MASK4_0,>mask2=%ymm2 +vmovapd PQCLEAN_MCELIECE6688128F_AVX_MASK4_0(%rip),%ymm2 + +# qhasm: mask3 aligned= mem256[ PQCLEAN_MCELIECE6688128F_AVX_MASK4_1 ] +# asm 1: vmovapd PQCLEAN_MCELIECE6688128F_AVX_MASK4_1,>mask3=reg256#4 +# asm 2: vmovapd PQCLEAN_MCELIECE6688128F_AVX_MASK4_1,>mask3=%ymm3 +vmovapd PQCLEAN_MCELIECE6688128F_AVX_MASK4_1(%rip),%ymm3 + +# qhasm: mask4 aligned= mem256[ PQCLEAN_MCELIECE6688128F_AVX_MASK3_0 ] +# asm 1: vmovapd PQCLEAN_MCELIECE6688128F_AVX_MASK3_0,>mask4=reg256#5 +# asm 2: vmovapd PQCLEAN_MCELIECE6688128F_AVX_MASK3_0,>mask4=%ymm4 +vmovapd PQCLEAN_MCELIECE6688128F_AVX_MASK3_0(%rip),%ymm4 + +# qhasm: mask5 aligned= mem256[ 
PQCLEAN_MCELIECE6688128F_AVX_MASK3_1 ] +# asm 1: vmovapd PQCLEAN_MCELIECE6688128F_AVX_MASK3_1,>mask5=reg256#6 +# asm 2: vmovapd PQCLEAN_MCELIECE6688128F_AVX_MASK3_1,>mask5=%ymm5 +vmovapd PQCLEAN_MCELIECE6688128F_AVX_MASK3_1(%rip),%ymm5 + +# qhasm: x0 = mem256[ input_0 + 0 ] +# asm 1: vmovupd 0(x0=reg256#7 +# asm 2: vmovupd 0(x0=%ymm6 +vmovupd 0(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 256 ] +# asm 1: vmovupd 256(x1=reg256#8 +# asm 2: vmovupd 256(x1=%ymm7 +vmovupd 256(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 512 ] +# asm 1: vmovupd 512(x2=reg256#9 +# asm 2: vmovupd 512(x2=%ymm8 +vmovupd 512(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 768 ] +# asm 1: vmovupd 768(x3=reg256#10 +# asm 2: vmovupd 768(x3=%ymm9 +vmovupd 768(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1024 ] +# asm 1: vmovupd 1024(x4=reg256#11 +# asm 2: vmovupd 1024(x4=%ymm10 +vmovupd 1024(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1280 ] +# asm 1: vmovupd 1280(x5=reg256#12 +# asm 2: vmovupd 1280(x5=%ymm11 +vmovupd 1280(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1536 ] +# asm 1: vmovupd 1536(x6=reg256#13 +# asm 2: vmovupd 1536(x6=%ymm12 +vmovupd 1536(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1792 ] +# asm 1: vmovupd 1792(x7=reg256#14 +# asm 2: vmovupd 1792(x7=%ymm13 +vmovupd 1792(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq 
$32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw 
$8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 0 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 32(x0=%ymm6 +vmovupd 32(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 288 ] +# asm 1: vmovupd 288(x1=reg256#8 +# asm 2: vmovupd 288(x1=%ymm7 +vmovupd 288(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 544 ] +# asm 1: vmovupd 544(x2=reg256#9 +# asm 2: vmovupd 544(x2=%ymm8 +vmovupd 544(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 800 ] +# asm 1: vmovupd 800(x3=reg256#10 +# asm 2: vmovupd 800(x3=%ymm9 +vmovupd 800(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1056 ] +# asm 1: vmovupd 1056(x4=reg256#11 +# asm 2: vmovupd 1056(x4=%ymm10 +vmovupd 1056(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1312 ] +# asm 1: vmovupd 1312(x5=reg256#12 +# asm 2: vmovupd 1312(x5=%ymm11 +vmovupd 1312(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1568 ] +# asm 1: vmovupd 1568(x6=reg256#13 +# asm 2: vmovupd 1568(x6=%ymm12 +vmovupd 1568(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1824 ] +# asm 1: vmovupd 1824(x7=reg256#14 +# asm 2: vmovupd 1824(x7=%ymm13 +vmovupd 1824(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand 
v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: 
vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw 
$8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 32 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 64(x0=%ymm6 +vmovupd 64(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 320 ] +# asm 1: vmovupd 320(x1=reg256#8 +# asm 2: vmovupd 320(x1=%ymm7 +vmovupd 320(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 576 ] +# asm 1: vmovupd 576(x2=reg256#9 +# asm 2: vmovupd 576(x2=%ymm8 +vmovupd 576(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 832 ] +# asm 1: vmovupd 832(x3=reg256#10 +# asm 2: vmovupd 832(x3=%ymm9 +vmovupd 832(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1088 ] +# asm 1: vmovupd 1088(x4=reg256#11 +# asm 2: vmovupd 1088(x4=%ymm10 +vmovupd 1088(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1344 ] +# asm 1: vmovupd 1344(x5=reg256#12 +# asm 2: vmovupd 1344(x5=%ymm11 +vmovupd 1344(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1600 ] +# asm 1: vmovupd 1600(x6=reg256#13 +# asm 2: vmovupd 1600(x6=%ymm12 +vmovupd 1600(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1856 ] +# asm 1: vmovupd 1856(x7=reg256#14 +# asm 2: vmovupd 1856(x7=%ymm13 +vmovupd 1856(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand 
v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld 
$16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 64 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 96(x0=%ymm6 +vmovupd 96(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 352 ] +# asm 1: vmovupd 352(x1=reg256#8 +# asm 2: vmovupd 352(x1=%ymm7 +vmovupd 352(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 608 ] +# asm 1: vmovupd 608(x2=reg256#9 +# asm 2: vmovupd 
608(x2=%ymm8 +vmovupd 608(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 864 ] +# asm 1: vmovupd 864(x3=reg256#10 +# asm 2: vmovupd 864(x3=%ymm9 +vmovupd 864(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1120 ] +# asm 1: vmovupd 1120(x4=reg256#11 +# asm 2: vmovupd 1120(x4=%ymm10 +vmovupd 1120(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1376 ] +# asm 1: vmovupd 1376(x5=reg256#12 +# asm 2: vmovupd 1376(x5=%ymm11 +vmovupd 1376(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1632 ] +# asm 1: vmovupd 1632(x6=reg256#13 +# asm 2: vmovupd 1632(x6=%ymm12 +vmovupd 1632(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1888 ] +# asm 1: vmovupd 1888(x7=reg256#14 +# asm 2: vmovupd 1888(x7=%ymm13 +vmovupd 1888(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 
+vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw 
$8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 96 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 128(x0=%ymm6 +vmovupd 128(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 384 ] +# asm 1: vmovupd 384(x1=reg256#8 +# asm 2: vmovupd 384(x1=%ymm7 +vmovupd 384(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 640 ] +# asm 1: vmovupd 640(x2=reg256#9 +# asm 2: vmovupd 640(x2=%ymm8 +vmovupd 640(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 896 ] +# asm 1: vmovupd 896(x3=reg256#10 +# asm 2: vmovupd 896(x3=%ymm9 +vmovupd 896(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1152 ] +# asm 1: vmovupd 1152(x4=reg256#11 +# asm 2: vmovupd 1152(x4=%ymm10 +vmovupd 1152(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1408 ] +# asm 1: vmovupd 1408(x5=reg256#12 +# asm 2: vmovupd 1408(x5=%ymm11 +vmovupd 1408(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1664 ] +# asm 1: vmovupd 1664(x6=reg256#13 +# asm 2: vmovupd 1664(x6=%ymm12 +vmovupd 1664(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1920 ] +# asm 1: vmovupd 1920(x7=reg256#14 +# asm 2: vmovupd 1920(x7=%ymm13 +vmovupd 1920(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor 
x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand 
v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw 
$8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 128 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 160(x0=%ymm6 +vmovupd 160(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 416 ] +# asm 1: vmovupd 416(x1=reg256#8 +# asm 2: vmovupd 416(x1=%ymm7 +vmovupd 416(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 672 ] +# asm 1: vmovupd 672(x2=reg256#9 +# asm 2: vmovupd 672(x2=%ymm8 +vmovupd 672(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 928 ] +# asm 1: vmovupd 928(x3=reg256#10 +# asm 2: vmovupd 928(x3=%ymm9 +vmovupd 928(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1184 ] +# asm 1: vmovupd 1184(x4=reg256#11 +# asm 2: vmovupd 1184(x4=%ymm10 +vmovupd 1184(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1440 ] +# asm 1: vmovupd 1440(x5=reg256#12 +# asm 2: vmovupd 1440(x5=%ymm11 +vmovupd 1440(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1696 ] +# asm 1: vmovupd 1696(x6=reg256#13 +# asm 2: vmovupd 1696(x6=%ymm12 +vmovupd 1696(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1952 ] +# asm 1: vmovupd 1952(x7=reg256#14 +# asm 2: vmovupd 1952(x7=%ymm13 +vmovupd 1952(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor 
x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand 
%ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 160 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 192(x0=%ymm6 +vmovupd 192(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 448 ] +# asm 1: vmovupd 448(x1=reg256#8 +# asm 2: vmovupd 448(x1=%ymm7 +vmovupd 448(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 704 ] +# asm 1: vmovupd 704(x2=reg256#9 +# asm 2: vmovupd 704(x2=%ymm8 +vmovupd 704(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 960 ] +# asm 1: vmovupd 960(x3=reg256#10 +# asm 2: vmovupd 960(x3=%ymm9 +vmovupd 960(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1216 ] +# asm 1: vmovupd 1216(x4=reg256#11 +# asm 2: vmovupd 1216(x4=%ymm10 +vmovupd 1216(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1472 ] +# asm 1: vmovupd 1472(x5=reg256#12 +# asm 2: vmovupd 1472(x5=%ymm11 +vmovupd 1472(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1728 ] +# asm 1: vmovupd 1728(x6=reg256#13 +# asm 2: vmovupd 1728(x6=%ymm12 +vmovupd 
1728(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1984 ] +# asm 1: vmovupd 1984(x7=reg256#14 +# asm 2: vmovupd 1984(x7=%ymm13 +vmovupd 1984(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor 
x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand 
%ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 192 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 224(x0=%ymm6 +vmovupd 224(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 480 ] +# asm 1: vmovupd 480(x1=reg256#8 +# asm 2: vmovupd 480(x1=%ymm7 +vmovupd 480(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 736 ] +# asm 1: vmovupd 736(x2=reg256#9 +# asm 2: vmovupd 736(x2=%ymm8 +vmovupd 736(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 992 ] +# asm 1: vmovupd 992(x3=reg256#10 +# asm 2: vmovupd 992(x3=%ymm9 +vmovupd 992(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1248 ] +# asm 1: vmovupd 1248(x4=reg256#11 +# asm 2: vmovupd 1248(x4=%ymm10 +vmovupd 1248(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1504 ] +# asm 1: vmovupd 1504(x5=reg256#12 +# asm 2: vmovupd 1504(x5=%ymm11 +vmovupd 1504(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1760 ] +# asm 1: vmovupd 1760(x6=reg256#13 +# asm 2: vmovupd 1760(x6=%ymm12 +vmovupd 1760(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 2016 ] +# asm 1: vmovupd 2016(x7=reg256#14 +# asm 2: vmovupd 2016(x7=%ymm13 +vmovupd 2016(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 
2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#1 +# asm 2: vpand v00=%ymm0 +vpand %ymm9,%ymm0,%ymm0 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#13 +# asm 2: vpsllq $32,v10=%ymm12 +vpsllq $32,%ymm13,%ymm12 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#2 +# asm 2: vpand v11=%ymm1 +vpand %ymm13,%ymm1,%ymm1 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#1 +# asm 2: vpor x3=%ymm0 +vpor %ymm0,%ymm12,%ymm0 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#2 +# asm 2: vpor x7=%ymm1 +vpor %ymm9,%ymm1,%ymm1 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm14,%ymm2,%ymm9 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#13 +# asm 2: vpslld $16,v10=%ymm12 +vpslld $16,%ymm11,%ymm12 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#14 +# asm 2: vpsrld $16,v01=%ymm13 +vpsrld $16,%ymm14,%ymm13 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm12,%ymm9 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm13,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm10,%ymm2,%ymm12 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#14 +# asm 2: vpslld $16,v10=%ymm13 +vpslld $16,%ymm0,%ymm13 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#1 +# asm 2: vpand v11=%ymm0 +vpand %ymm0,%ymm3,%ymm0 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#13 +# asm 2: vpor x1=%ymm12 +vpor %ymm12,%ymm13,%ymm12 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#1 +# asm 2: vpor x3=%ymm0 +vpor %ymm10,%ymm0,%ymm0 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm6,%ymm2,%ymm10 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#14 +# asm 2: vpslld $16,v10=%ymm13 +vpslld $16,%ymm8,%ymm13 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#11 +# asm 2: vpor x4=%ymm10 +vpor %ymm10,%ymm13,%ymm10 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor 
%ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#3 +# asm 2: vpand v00=%ymm2 +vpand %ymm7,%ymm2,%ymm2 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#9 +# asm 2: vpslld $16,v10=%ymm8 +vpslld $16,%ymm1,%ymm8 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#2 +# asm 2: vpand v11=%ymm1 +vpand %ymm1,%ymm3,%ymm1 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#3 +# asm 2: vpor x5=%ymm2 +vpor %ymm2,%ymm8,%ymm2 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#2 +# asm 2: vpor x7=%ymm1 +vpor %ymm7,%ymm1,%ymm1 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#4 +# asm 2: vpand v00=%ymm3 +vpand %ymm9,%ymm4,%ymm3 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#8 +# asm 2: vpsllw $8,v10=%ymm7 +vpsllw $8,%ymm12,%ymm7 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#9 +# asm 2: vpsrlw $8,v01=%ymm8 +vpsrlw $8,%ymm9,%ymm8 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm12,%ymm5,%ymm9 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#4 +# asm 2: vpor x0=%ymm3 +vpor %ymm3,%ymm7,%ymm3 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#8 +# asm 2: vpor x1=%ymm7 +vpor %ymm8,%ymm9,%ymm7 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm11,%ymm4,%ymm8 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#10 +# asm 2: vpsllw $8,v10=%ymm9 +vpsllw $8,%ymm0,%ymm9 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#1 +# asm 2: vpand v11=%ymm0 +vpand %ymm0,%ymm5,%ymm0 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#9 +# asm 2: vpor x2=%ymm8 +vpor %ymm8,%ymm9,%ymm8 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#1 +# asm 2: vpor x3=%ymm0 +vpor %ymm11,%ymm0,%ymm0 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm10,%ymm4,%ymm9 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#12 +# asm 2: vpsllw $8,v10=%ymm11 +vpsllw $8,%ymm2,%ymm11 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#11 +# asm 2: vpsrlw $8,v01=%ymm10 +vpsrlw $8,%ymm10,%ymm10 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#3 +# asm 2: vpand v11=%ymm2 +vpand %ymm2,%ymm5,%ymm2 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#10 +# asm 2: vpor x4=%ymm9 +vpor %ymm9,%ymm11,%ymm9 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#3 +# asm 2: vpor x5=%ymm2 +vpor %ymm10,%ymm2,%ymm2 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#5 +# asm 2: vpand v00=%ymm4 +vpand %ymm6,%ymm4,%ymm4 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#11 +# asm 2: vpsllw $8,v10=%ymm10 +vpsllw $8,%ymm1,%ymm10 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#2 +# asm 2: vpand v11=%ymm1 +vpand %ymm1,%ymm5,%ymm1 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#5 +# asm 2: vpor x6=%ymm4 +vpor %ymm4,%ymm10,%ymm4 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#2 +# asm 2: vpor x7=%ymm1 +vpor %ymm6,%ymm1,%ymm1 + +# qhasm: mem256[ input_0 + 224 ] = x0 +# asm 1: vmovupd mask0=reg256#1 +# asm 2: vmovapd PQCLEAN_MCELIECE6688128F_AVX_MASK2_0,>mask0=%ymm0 +vmovapd 
PQCLEAN_MCELIECE6688128F_AVX_MASK2_0(%rip),%ymm0 + +# qhasm: mask1 aligned= mem256[ PQCLEAN_MCELIECE6688128F_AVX_MASK2_1 ] +# asm 1: vmovapd PQCLEAN_MCELIECE6688128F_AVX_MASK2_1,>mask1=reg256#2 +# asm 2: vmovapd PQCLEAN_MCELIECE6688128F_AVX_MASK2_1,>mask1=%ymm1 +vmovapd PQCLEAN_MCELIECE6688128F_AVX_MASK2_1(%rip),%ymm1 + +# qhasm: mask2 aligned= mem256[ PQCLEAN_MCELIECE6688128F_AVX_MASK1_0 ] +# asm 1: vmovapd PQCLEAN_MCELIECE6688128F_AVX_MASK1_0,>mask2=reg256#3 +# asm 2: vmovapd PQCLEAN_MCELIECE6688128F_AVX_MASK1_0,>mask2=%ymm2 +vmovapd PQCLEAN_MCELIECE6688128F_AVX_MASK1_0(%rip),%ymm2 + +# qhasm: mask3 aligned= mem256[ PQCLEAN_MCELIECE6688128F_AVX_MASK1_1 ] +# asm 1: vmovapd PQCLEAN_MCELIECE6688128F_AVX_MASK1_1,>mask3=reg256#4 +# asm 2: vmovapd PQCLEAN_MCELIECE6688128F_AVX_MASK1_1,>mask3=%ymm3 +vmovapd PQCLEAN_MCELIECE6688128F_AVX_MASK1_1(%rip),%ymm3 + +# qhasm: mask4 aligned= mem256[ PQCLEAN_MCELIECE6688128F_AVX_MASK0_0 ] +# asm 1: vmovapd PQCLEAN_MCELIECE6688128F_AVX_MASK0_0,>mask4=reg256#5 +# asm 2: vmovapd PQCLEAN_MCELIECE6688128F_AVX_MASK0_0,>mask4=%ymm4 +vmovapd PQCLEAN_MCELIECE6688128F_AVX_MASK0_0(%rip),%ymm4 + +# qhasm: mask5 aligned= mem256[ PQCLEAN_MCELIECE6688128F_AVX_MASK0_1 ] +# asm 1: vmovapd PQCLEAN_MCELIECE6688128F_AVX_MASK0_1,>mask5=reg256#6 +# asm 2: vmovapd PQCLEAN_MCELIECE6688128F_AVX_MASK0_1,>mask5=%ymm5 +vmovapd PQCLEAN_MCELIECE6688128F_AVX_MASK0_1(%rip),%ymm5 + +# qhasm: x0 = mem256[ input_0 + 0 ] +# asm 1: vmovupd 0(x0=reg256#7 +# asm 2: vmovupd 0(x0=%ymm6 +vmovupd 0(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 32 ] +# asm 1: vmovupd 32(x1=reg256#8 +# asm 2: vmovupd 32(x1=%ymm7 +vmovupd 32(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 64 ] +# asm 1: vmovupd 64(x2=reg256#9 +# asm 2: vmovupd 64(x2=%ymm8 +vmovupd 64(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 96 ] +# asm 1: vmovupd 96(x3=reg256#10 +# asm 2: vmovupd 96(x3=%ymm9 +vmovupd 96(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 128 ] +# asm 1: vmovupd 128(x4=reg256#11 +# asm 2: vmovupd 128(x4=%ymm10 +vmovupd 128(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 160 ] +# asm 1: vmovupd 160(x5=reg256#12 +# asm 2: vmovupd 160(x5=%ymm11 +vmovupd 160(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 192 ] +# asm 1: vmovupd 192(x6=reg256#13 +# asm 2: vmovupd 192(x6=%ymm12 +vmovupd 192(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 224 ] +# asm 1: vmovupd 224(x7=reg256#14 +# asm 2: vmovupd 224(x7=%ymm13 +vmovupd 224(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand 
%ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 
2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 0 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 256(x0=%ymm6 +vmovupd 256(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 288 ] +# asm 1: 
vmovupd 288(x1=reg256#8 +# asm 2: vmovupd 288(x1=%ymm7 +vmovupd 288(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 320 ] +# asm 1: vmovupd 320(x2=reg256#9 +# asm 2: vmovupd 320(x2=%ymm8 +vmovupd 320(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 352 ] +# asm 1: vmovupd 352(x3=reg256#10 +# asm 2: vmovupd 352(x3=%ymm9 +vmovupd 352(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 384 ] +# asm 1: vmovupd 384(x4=reg256#11 +# asm 2: vmovupd 384(x4=%ymm10 +vmovupd 384(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 416 ] +# asm 1: vmovupd 416(x5=reg256#12 +# asm 2: vmovupd 416(x5=%ymm11 +vmovupd 416(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 448 ] +# asm 1: vmovupd 448(x6=reg256#13 +# asm 2: vmovupd 448(x6=%ymm12 +vmovupd 448(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 480 ] +# asm 1: vmovupd 480(x7=reg256#14 +# asm 2: vmovupd 480(x7=%ymm13 +vmovupd 480(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + 
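# Note on the generated code: the repeating vpand/vpsllq/vpslld/vpsllw/vpor
# groups in this listing implement a constant-time bit-matrix transpose. Each
# group of four temporaries (v00, v10, v01, v11) exchanges half-width bit
# fields between a pair of rows; the shift distances visible above are 32, 16
# and 8 in one pass and 4, 2 and 1 in the pass that loads the
# PQCLEAN_MCELIECE6688128F_AVX_MASK* constants. As a rough orientation aid
# only, the same butterfly technique is sketched below in scalar C on a
# 64x64 bit matrix; the function name, the in-place layout and the 64-bit row
# type are illustrative assumptions and are not part of the generated file,
# which applies the same exchanges to 256-bit ymm rows.

#include <stdint.h>

/* Illustrative sketch only (not the generated code above): transpose a
 * 64x64 bit matrix stored as one uint64_t per row, using the same
 * mask-and-shift butterfly pattern as the qhasm listing. */
static void transpose_64x64_sketch(uint64_t r[64]) {
    /* mask[d][0] selects the low bit group, mask[d][1] the high bit group,
     * for stride s = 1 << d (1, 2, 4, 8, 16, 32). */
    static const uint64_t mask[6][2] = {
        {0x5555555555555555ULL, 0xAAAAAAAAAAAAAAAAULL}, /* stride 1  */
        {0x3333333333333333ULL, 0xCCCCCCCCCCCCCCCCULL}, /* stride 2  */
        {0x0F0F0F0F0F0F0F0FULL, 0xF0F0F0F0F0F0F0F0ULL}, /* stride 4  */
        {0x00FF00FF00FF00FFULL, 0xFF00FF00FF00FF00ULL}, /* stride 8  */
        {0x0000FFFF0000FFFFULL, 0xFFFF0000FFFF0000ULL}, /* stride 16 */
        {0x00000000FFFFFFFFULL, 0xFFFFFFFF00000000ULL}, /* stride 32 */
    };

    for (int d = 5; d >= 0; d--) {
        int s = 1 << d;                      /* shift distance this round */
        for (int i = 0; i < 64; i += 2 * s) {
            for (int j = i; j < i + s; j++) {
                /* v00/v10/v01/v11 mirror the names used in the qhasm comments */
                uint64_t v00 = r[j]      & mask[d][0];
                uint64_t v10 = (r[j + s] & mask[d][0]) << s;
                uint64_t v01 = (r[j]     & mask[d][1]) >> s;
                uint64_t v11 = r[j + s]  & mask[d][1];
                r[j]     = v00 | v10;        /* x_j     = v00 | v10 */
                r[j + s] = v01 | v11;        /* x_{j+s} = v01 | v11 */
            }
        }
    }
}

# A caller would load 64 rows, run the sketch, and read the rows back as
# columns; the vectorized listing performs the equivalent exchanges on
# 256-bit registers loaded from and stored back to input_0 (%rdi).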
+# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand 
%ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 256 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 512(x0=%ymm6 +vmovupd 512(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 544 ] +# asm 1: vmovupd 544(x1=reg256#8 +# asm 2: vmovupd 544(x1=%ymm7 +vmovupd 544(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 576 ] +# asm 1: vmovupd 576(x2=reg256#9 +# asm 2: vmovupd 576(x2=%ymm8 +vmovupd 576(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 608 ] +# asm 1: vmovupd 608(x3=reg256#10 +# asm 2: vmovupd 608(x3=%ymm9 +vmovupd 608(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 640 ] +# asm 1: vmovupd 640(x4=reg256#11 +# asm 2: vmovupd 640(x4=%ymm10 +vmovupd 640(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 672 ] +# asm 1: vmovupd 672(x5=reg256#12 +# asm 2: vmovupd 672(x5=%ymm11 +vmovupd 672(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 704 ] +# asm 1: vmovupd 704(x6=reg256#13 +# asm 2: vmovupd 704(x6=%ymm12 +vmovupd 704(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 736 ] +# asm 1: vmovupd 736(x7=reg256#14 +# asm 2: vmovupd 736(x7=%ymm13 +vmovupd 736(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 
+# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 
+ +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + 
+# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 512 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 768(x0=%ymm6 +vmovupd 768(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 800 ] +# asm 1: vmovupd 800(x1=reg256#8 +# asm 2: vmovupd 800(x1=%ymm7 +vmovupd 800(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 832 ] +# asm 1: vmovupd 832(x2=reg256#9 +# asm 2: vmovupd 832(x2=%ymm8 +vmovupd 832(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 864 ] +# asm 1: vmovupd 864(x3=reg256#10 +# asm 2: vmovupd 864(x3=%ymm9 +vmovupd 864(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 896 ] +# asm 1: vmovupd 896(x4=reg256#11 +# asm 2: vmovupd 896(x4=%ymm10 +vmovupd 896(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 928 ] +# asm 1: vmovupd 928(x5=reg256#12 +# asm 2: vmovupd 928(x5=%ymm11 +vmovupd 928(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 960 ] +# asm 1: vmovupd 960(x6=reg256#13 +# asm 2: vmovupd 960(x6=%ymm12 +vmovupd 960(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 992 ] +# asm 1: vmovupd 992(x7=reg256#14 +# asm 2: vmovupd 992(x7=%ymm13 +vmovupd 992(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq 
$4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 
unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 768 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 1024(x0=%ymm6 +vmovupd 1024(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 1056 ] +# asm 1: vmovupd 1056(x1=reg256#8 +# asm 2: vmovupd 1056(x1=%ymm7 +vmovupd 1056(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 1088 ] +# asm 1: vmovupd 1088(x2=reg256#9 +# asm 2: vmovupd 1088(x2=%ymm8 +vmovupd 1088(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 1120 ] +# asm 1: vmovupd 1120(x3=reg256#10 +# asm 2: vmovupd 1120(x3=%ymm9 +vmovupd 1120(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1152 ] +# asm 1: vmovupd 1152(x4=reg256#11 +# asm 2: vmovupd 1152(x4=%ymm10 +vmovupd 1152(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1184 ] +# asm 1: vmovupd 1184(x5=reg256#12 +# asm 2: vmovupd 1184(x5=%ymm11 +vmovupd 1184(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1216 ] +# asm 1: vmovupd 1216(x6=reg256#13 +# asm 2: vmovupd 1216(x6=%ymm12 +vmovupd 1216(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1248 ] +# asm 1: vmovupd 1248(x7=reg256#14 +# asm 2: vmovupd 1248(x7=%ymm13 +vmovupd 1248(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 
1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# 
qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 
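The rest of this hunk continues the same pattern: the three interleave rounds (shift 4, then 2, then 1) are applied to each successive 256-byte slice of the buffer, eight 32-byte rows at a time (the vmovupd offsets step through 256, 512, 768, …, 1792). A rough per-slice model, reusing the interleave_step sketch given earlier — the mask constants are the usual bit-transpose values and are assumptions, not copied from this file:

/* Rough model of the per-slice pattern; relies on interleave_step() from the
 * earlier sketch.  Pairings and shift widths mirror the qhasm annotations. */
static void interleave_slice(uint64_t x[8]) {
    static const uint64_t m[3][2] = {
        {0x0F0F0F0F0F0F0F0FULL, 0xF0F0F0F0F0F0F0F0ULL}, /* s = 4: mask0/mask1 (assumed) */
        {0x3333333333333333ULL, 0xCCCCCCCCCCCCCCCCULL}, /* s = 2: mask2/mask3 (assumed) */
        {0x5555555555555555ULL, 0xAAAAAAAAAAAAAAAAULL}, /* s = 1: mask4/mask5 (assumed) */
    };
    int i;
    for (i = 0; i < 4; i++) {            /* s = 4 pairs (0,4) (1,5) (2,6) (3,7) */
        interleave_step(&x[i], &x[i + 4], m[0][0], m[0][1], 4);
    }
    for (i = 0; i < 8; i++) {            /* s = 2 pairs (0,2) (1,3) (4,6) (5,7) */
        if ((i & 2) == 0) {
            interleave_step(&x[i], &x[i + 2], m[1][0], m[1][1], 2);
        }
    }
    for (i = 0; i < 8; i += 2) {         /* s = 1 pairs (0,1) (2,3) (4,5) (6,7) */
        interleave_step(&x[i], &x[i + 1], m[2][0], m[2][1], 1);
    }
}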
+ +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 1024 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 1280(x0=%ymm6 +vmovupd 1280(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 1312 ] +# asm 1: vmovupd 1312(x1=reg256#8 +# asm 2: vmovupd 1312(x1=%ymm7 +vmovupd 1312(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 1344 ] +# asm 1: vmovupd 1344(x2=reg256#9 +# asm 2: vmovupd 1344(x2=%ymm8 +vmovupd 1344(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 1376 ] +# asm 1: vmovupd 1376(x3=reg256#10 +# asm 2: vmovupd 1376(x3=%ymm9 +vmovupd 1376(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1408 ] +# asm 1: vmovupd 1408(x4=reg256#11 +# asm 2: vmovupd 1408(x4=%ymm10 +vmovupd 1408(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1440 ] +# asm 1: vmovupd 1440(x5=reg256#12 +# asm 2: vmovupd 1440(x5=%ymm11 +vmovupd 1440(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1472 ] +# asm 1: vmovupd 1472(x6=reg256#13 +# asm 2: vmovupd 1472(x6=%ymm12 +vmovupd 1472(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1504 ] +# asm 1: vmovupd 1504(x7=reg256#14 +# asm 2: vmovupd 1504(x7=%ymm13 +vmovupd 1504(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 
unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# 
qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 1280 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 1536(x0=%ymm6 +vmovupd 1536(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 1568 ] +# asm 1: vmovupd 1568(x1=reg256#8 +# asm 2: vmovupd 1568(x1=%ymm7 +vmovupd 1568(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 1600 ] +# asm 1: vmovupd 1600(x2=reg256#9 +# asm 2: vmovupd 1600(x2=%ymm8 +vmovupd 1600(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 1632 ] +# asm 1: vmovupd 1632(x3=reg256#10 +# asm 2: vmovupd 1632(x3=%ymm9 +vmovupd 1632(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1664 ] +# asm 1: vmovupd 1664(x4=reg256#11 +# asm 2: vmovupd 1664(x4=%ymm10 +vmovupd 1664(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1696 ] +# asm 1: vmovupd 1696(x5=reg256#12 +# asm 2: vmovupd 1696(x5=%ymm11 +vmovupd 1696(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1728 ] +# asm 1: vmovupd 
1728(x6=reg256#13 +# asm 2: vmovupd 1728(x6=%ymm12 +vmovupd 1728(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1760 ] +# asm 1: vmovupd 1760(x7=reg256#14 +# asm 2: vmovupd 1760(x7=%ymm13 +vmovupd 1760(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor 
%ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand 
v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 1536 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 1792(x0=%ymm6 +vmovupd 1792(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 1824 ] +# asm 1: vmovupd 1824(x1=reg256#8 +# asm 2: vmovupd 1824(x1=%ymm7 +vmovupd 1824(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 1856 ] +# asm 1: vmovupd 1856(x2=reg256#9 +# asm 2: vmovupd 1856(x2=%ymm8 +vmovupd 1856(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 1888 ] +# asm 1: vmovupd 1888(x3=reg256#10 +# asm 2: vmovupd 1888(x3=%ymm9 +vmovupd 1888(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1920 ] +# asm 1: vmovupd 1920(x4=reg256#11 +# asm 2: vmovupd 1920(x4=%ymm10 +vmovupd 1920(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1952 ] +# asm 1: vmovupd 1952(x5=reg256#12 +# asm 2: vmovupd 1952(x5=%ymm11 +vmovupd 1952(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1984 ] +# asm 1: vmovupd 1984(x6=reg256#13 +# asm 2: vmovupd 1984(x6=%ymm12 +vmovupd 1984(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 2016 ] +# asm 1: vmovupd 2016(x7=reg256#14 +# asm 2: vmovupd 2016(x7=%ymm13 +vmovupd 2016(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor 
%ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#1 +# asm 2: vpand v10=%ymm0 +vpand %ymm13,%ymm0,%ymm0 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#2 +# asm 2: vpand v11=%ymm1 +vpand %ymm13,%ymm1,%ymm1 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#1 +# asm 2: vpor x3=%ymm0 +vpor %ymm12,%ymm0,%ymm0 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#2 +# asm 2: vpor x7=%ymm1 +vpor %ymm9,%ymm1,%ymm1 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm14,%ymm2,%ymm9 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#13 +# asm 2: vpand v10=%ymm12 +vpand %ymm11,%ymm2,%ymm12 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm14,%ymm3,%ymm13 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm12,%ymm9 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm13,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm10,%ymm2,%ymm12 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#14 +# asm 2: vpand v10=%ymm13 +vpand %ymm0,%ymm2,%ymm13 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#1 +# asm 2: vpand v11=%ymm0 +vpand %ymm0,%ymm3,%ymm0 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#13 +# asm 2: vpor x1=%ymm12 +vpor %ymm12,%ymm13,%ymm12 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#1 +# asm 2: vpor x3=%ymm0 +vpor %ymm10,%ymm0,%ymm0 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm6,%ymm2,%ymm10 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#14 +# asm 2: vpand v10=%ymm13 +vpand %ymm8,%ymm2,%ymm13 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#11 +# asm 2: vpor x4=%ymm10 +vpor %ymm10,%ymm13,%ymm10 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#3 +# asm 2: vpand v10=%ymm2 +vpand 
%ymm1,%ymm2,%ymm2 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#2 +# asm 2: vpand v11=%ymm1 +vpand %ymm1,%ymm3,%ymm1 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#3 +# asm 2: vpor x5=%ymm2 +vpor %ymm8,%ymm2,%ymm2 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#2 +# asm 2: vpor x7=%ymm1 +vpor %ymm7,%ymm1,%ymm1 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#4 +# asm 2: vpand v00=%ymm3 +vpand %ymm9,%ymm4,%ymm3 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#8 +# asm 2: vpand v10=%ymm7 +vpand %ymm12,%ymm4,%ymm7 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm9,%ymm5,%ymm8 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm12,%ymm5,%ymm9 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#4 +# asm 2: vpor x0=%ymm3 +vpor %ymm3,%ymm7,%ymm3 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#8 +# asm 2: vpor x1=%ymm7 +vpor %ymm8,%ymm9,%ymm7 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm11,%ymm4,%ymm8 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#10 +# asm 2: vpand v10=%ymm9 +vpand %ymm0,%ymm4,%ymm9 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#1 +# asm 2: vpand v11=%ymm0 +vpand %ymm0,%ymm5,%ymm0 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#9 +# asm 2: vpor x2=%ymm8 +vpor %ymm8,%ymm9,%ymm8 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#1 +# asm 2: vpor x3=%ymm0 +vpor %ymm11,%ymm0,%ymm0 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm10,%ymm4,%ymm9 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#12 +# asm 2: vpand v10=%ymm11 +vpand %ymm2,%ymm4,%ymm11 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#3 +# asm 2: vpand v11=%ymm2 +vpand %ymm2,%ymm5,%ymm2 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#10 +# asm 2: vpor x4=%ymm9 +vpor %ymm9,%ymm11,%ymm9 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#3 +# asm 2: vpor x5=%ymm2 +vpor %ymm10,%ymm2,%ymm2 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm6,%ymm4,%ymm10 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#5 +# asm 2: vpand v10=%ymm4 +vpand %ymm1,%ymm4,%ymm4 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#2 +# asm 2: vpand v11=%ymm1 +vpand %ymm1,%ymm5,%ymm1 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#5 +# asm 2: vpor x6=%ymm4 +vpor %ymm10,%ymm4,%ymm4 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#2 +# asm 2: vpor x7=%ymm1 +vpor %ymm6,%ymm1,%ymm1 + +# qhasm: mem256[ input_0 + 1792 ] = x0 +# asm 1: vmovupd +#include + +void PQCLEAN_MCELIECE6688128F_AVX_uint32_sort(uint32_t *x, size_t n); + +#endif diff --git a/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/update_asm.S b/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/update_asm.S new file mode 100644 index 0000000000..ee626daf9a --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/update_asm.S @@ -0,0 +1,576 @@ + +# qhasm: 
int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: int64 s0 + +# qhasm: int64 s1 + +# qhasm: int64 s2 + +# qhasm: enter update_asm +.p2align 5 +.global _PQCLEAN_MCELIECE6688128F_AVX_update_asm +.global PQCLEAN_MCELIECE6688128F_AVX_update_asm +_PQCLEAN_MCELIECE6688128F_AVX_update_asm: +PQCLEAN_MCELIECE6688128F_AVX_update_asm: +mov %rsp,%r11 +and $31,%r11 +add $0,%r11 +sub %r11,%rsp + +# qhasm: s2 = input_1 +# asm 1: mov s2=int64#2 +# asm 2: mov s2=%rsi +mov %rsi,%rsi + +# qhasm: s0 = mem64[ input_0 + 0 ] +# asm 1: movq 0(s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 
8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1, + +void PQCLEAN_MCELIECE6688128F_AVX_store_i(unsigned char *out, uint64_t in, int i) { + int j; + + for (j = 0; j < i; j++) { + out[j] = (in >> (j * 8)) & 0xFF; + } +} + +void PQCLEAN_MCELIECE6688128F_AVX_store2(unsigned char *dest, uint16_t a) { + dest[0] = a & 0xFF; + dest[1] = a >> 8; +} + +uint16_t PQCLEAN_MCELIECE6688128F_AVX_load2(const unsigned char *src) { + uint16_t a; + + a = src[1]; + a <<= 8; + a |= src[0]; + + return a & GFMASK; +} + +uint32_t PQCLEAN_MCELIECE6688128F_AVX_load4(const unsigned char *src) { + uint32_t a; + + a = src[3]; + a <<= 8; + a |= src[2]; + a <<= 8; + a |= src[1]; + a <<= 8; + a |= src[0]; + + return a; +} + +void PQCLEAN_MCELIECE6688128F_AVX_irr_load(vec128 *out, const unsigned char *in) { + int i, j; + uint64_t v[2]; + uint16_t irr[ SYS_T + 1 ]; + + for (i = 0; i < SYS_T; i++) { + irr[i] = PQCLEAN_MCELIECE6688128F_AVX_load2(in + i * 2); + irr[i] &= GFMASK; + } + + irr[ SYS_T ] = 1; + + for (i = 0; i < GFBITS; i++) { + v[0] = v[1] = 0; + + for (j = 63; j >= 0; j--) { + v[0] <<= 1; + v[0] |= (irr[j] >> i) & 1; + } + for (j = SYS_T; j >= 64; j--) { + v[1] <<= 1; + v[1] |= (irr[j] >> i) & 1; + } + + out[i] = PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(v[0], v[1]); + } +} + +void PQCLEAN_MCELIECE6688128F_AVX_store8(unsigned char *out, uint64_t in) { + out[0] = (in >> 0x00) & 0xFF; + out[1] = (in >> 0x08) & 0xFF; + out[2] = (in >> 0x10) & 0xFF; + out[3] = (in >> 0x18) & 0xFF; + out[4] = (in >> 0x20) & 0xFF; + out[5] = (in >> 0x28) & 0xFF; + out[6] = (in >> 0x30) & 0xFF; + out[7] = (in >> 0x38) & 0xFF; +} + +uint64_t PQCLEAN_MCELIECE6688128F_AVX_load8(const unsigned char *in) { + int i; + uint64_t ret = in[7]; + + for (i = 6; i >= 0; i--) { + ret <<= 8; + ret |= in[i]; + } + + return ret; +} + +vec128 PQCLEAN_MCELIECE6688128F_AVX_load16(const unsigned char *in) { + return PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x( PQCLEAN_MCELIECE6688128F_AVX_load8(in), PQCLEAN_MCELIECE6688128F_AVX_load8(in + 8) ); +} + +void PQCLEAN_MCELIECE6688128F_AVX_store16(unsigned char *out, vec128 in) { + PQCLEAN_MCELIECE6688128F_AVX_store8(out + 0, PQCLEAN_MCELIECE6688128F_AVX_vec128_extract(in, 0)); + PQCLEAN_MCELIECE6688128F_AVX_store8(out + 8, PQCLEAN_MCELIECE6688128F_AVX_vec128_extract(in, 1)); +} diff --git a/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/util.h b/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/util.h new file mode 100644 index 0000000000..e206e40ae3 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/util.h @@ -0,0 +1,23 @@ +#ifndef PQCLEAN_MCELIECE6688128F_AVX_UTIL_H +#define PQCLEAN_MCELIECE6688128F_AVX_UTIL_H +/* + This file is for loading/storing data in a little-endian fashion +*/ + + +#include "vec128.h" + +#include + +void PQCLEAN_MCELIECE6688128F_AVX_store_i(unsigned char *out, uint64_t in, int i); +void PQCLEAN_MCELIECE6688128F_AVX_store2(unsigned char *dest, uint16_t a); 
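+// The helpers declared in this header mirror the little-endian
+// (de)serialization routines defined in util.c: store_i/store2/store8/store16
+// write integers byte-wise starting from the least significant byte, and
+// load2/load4/load8/load16 read them back, so for example store8 followed by
+// load8 on the same 8-byte buffer returns the original uint64_t.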
+uint16_t PQCLEAN_MCELIECE6688128F_AVX_load2(const unsigned char *src); +uint32_t PQCLEAN_MCELIECE6688128F_AVX_load4(const unsigned char *src); +void PQCLEAN_MCELIECE6688128F_AVX_irr_load(vec128 *out, const unsigned char *in); +void PQCLEAN_MCELIECE6688128F_AVX_store8(unsigned char *out, uint64_t in); +uint64_t PQCLEAN_MCELIECE6688128F_AVX_load8(const unsigned char *in); +vec128 PQCLEAN_MCELIECE6688128F_AVX_load16(const unsigned char *in); + +void PQCLEAN_MCELIECE6688128F_AVX_store16(unsigned char *out, vec128 in); + +#endif diff --git a/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/vec128.c b/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/vec128.c new file mode 100644 index 0000000000..16533dc382 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/vec128.c @@ -0,0 +1,79 @@ +#include "vec128.h" + +vec128 PQCLEAN_MCELIECE6688128F_AVX_vec128_set1_16b(uint16_t a) { + return _mm_set1_epi16(a); +} + +vec128 PQCLEAN_MCELIECE6688128F_AVX_vec128_setzero(void) { + return _mm_setzero_si128(); +} + +vec128 PQCLEAN_MCELIECE6688128F_AVX_vec128_and(vec128 a, vec128 b) { + return _mm_and_si128(a, b); +} + +vec128 PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(vec128 a, vec128 b) { + return _mm_xor_si128(a, b); +} + +vec128 PQCLEAN_MCELIECE6688128F_AVX_vec128_or(vec128 a, vec128 b) { + return _mm_or_si128(a, b); +} + +vec128 PQCLEAN_MCELIECE6688128F_AVX_vec128_sll_2x(vec128 a, int s) { + return _mm_slli_epi64(a, s); +} + +vec128 PQCLEAN_MCELIECE6688128F_AVX_vec128_srl_2x(vec128 a, int s) { + return _mm_srli_epi64(a, s); +} + +vec128 PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(uint64_t a0, uint64_t a1) { + return _mm_set_epi64x(a1, a0); +} + +vec128 PQCLEAN_MCELIECE6688128F_AVX_vec128_unpack_low(vec128 a, vec128 b) { + return _mm_unpacklo_epi64(a, b); +} + +vec128 PQCLEAN_MCELIECE6688128F_AVX_vec128_unpack_high(vec128 a, vec128 b) { + return _mm_unpackhi_epi64(a, b); +} + +vec128 PQCLEAN_MCELIECE6688128F_AVX_vec128_setbits(uint64_t a) { + return _mm_set1_epi64x(-a); +} + +void PQCLEAN_MCELIECE6688128F_AVX_vec128_copy(vec128 *dest, const vec128 *src) { + int i; + + for (i = 0; i < GFBITS; i++) { + dest[i] = src[i]; + } +} + +void PQCLEAN_MCELIECE6688128F_AVX_vec128_add(vec128 *c, const vec128 *a, const vec128 *b) { + int i; + + for (i = 0; i < GFBITS; i++) { + c[i] = PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(a[i], b[i]); + } +} + +vec128 PQCLEAN_MCELIECE6688128F_AVX_vec128_or_reduce(const vec128 *a) { + int i; + vec128 ret; + + ret = a[0]; + for (i = 1; i < GFBITS; i++) { + ret = PQCLEAN_MCELIECE6688128F_AVX_vec128_or(ret, a[i]); + } + + return ret; +} + +/* bitsliced field multiplications */ +void PQCLEAN_MCELIECE6688128F_AVX_vec128_mul(vec128 *h, vec128 *f, const vec128 *g) { + PQCLEAN_MCELIECE6688128F_AVX_vec128_mul_asm(h, f, g, 16); +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/vec128.h b/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/vec128.h new file mode 100644 index 0000000000..afc46d46f4 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/vec128.h @@ -0,0 +1,41 @@ +#ifndef PQCLEAN_MCELIECE6688128F_AVX_VEC128_H +#define PQCLEAN_MCELIECE6688128F_AVX_VEC128_H +/* + This file is for functions related to 128-bit vectors + including functions for bitsliced field operations +*/ + + +#include "params.h" + +#include +#include + +typedef __m128i vec128; + +// this needs to be a macro, because +// _mm_extract_epi64 requires a literal int argument. 
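+// For example, PQCLEAN_MCELIECE6688128F_AVX_vec128_extract(v, 1) evaluates to
+// _mm_extract_epi64(v, 1), the upper 64-bit lane of v; because the intrinsic's
+// index must be a compile-time constant, a wrapper function taking the index
+// as an ordinary int parameter would not work here.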
+#define PQCLEAN_MCELIECE6688128F_AVX_vec128_extract(a, i) ((uint64_t) _mm_extract_epi64((vec128) (a), (i))) + +int PQCLEAN_MCELIECE6688128F_AVX_vec128_testz(vec128 a); +vec128 PQCLEAN_MCELIECE6688128F_AVX_vec128_set1_16b(uint16_t a); +vec128 PQCLEAN_MCELIECE6688128F_AVX_vec128_setzero(void); +vec128 PQCLEAN_MCELIECE6688128F_AVX_vec128_and(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE6688128F_AVX_vec128_xor(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE6688128F_AVX_vec128_or(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE6688128F_AVX_vec128_sll_2x(vec128 a, int s); +vec128 PQCLEAN_MCELIECE6688128F_AVX_vec128_srl_2x(vec128 a, int s); +vec128 PQCLEAN_MCELIECE6688128F_AVX_vec128_set2x(uint64_t a0, uint64_t a1); +vec128 PQCLEAN_MCELIECE6688128F_AVX_vec128_unpack_low(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE6688128F_AVX_vec128_unpack_high(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE6688128F_AVX_vec128_setbits(uint64_t a); +void PQCLEAN_MCELIECE6688128F_AVX_vec128_copy(vec128 *dest, const vec128 *src); +void PQCLEAN_MCELIECE6688128F_AVX_vec128_add(vec128 *c, const vec128 *a, const vec128 *b); +vec128 PQCLEAN_MCELIECE6688128F_AVX_vec128_or_reduce(const vec128 *a); + +extern void PQCLEAN_MCELIECE6688128F_AVX_vec128_mul_asm(vec128 *, vec128 *, const vec128 *, int); + +/* bitsliced field multiplications */ +void PQCLEAN_MCELIECE6688128F_AVX_vec128_mul(vec128 *h, vec128 *f, const vec128 *g); + +#endif diff --git a/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/vec128_mul_asm.S b/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/vec128_mul_asm.S new file mode 100644 index 0000000000..e97f1d61dc --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/vec128_mul_asm.S @@ -0,0 +1,1816 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: reg256 b0 + +# qhasm: reg256 b1 + +# qhasm: reg256 b2 + +# qhasm: reg256 b3 + +# qhasm: reg256 b4 + +# qhasm: reg256 b5 + +# qhasm: reg256 b6 + +# qhasm: reg256 b7 + +# qhasm: reg256 b8 + +# qhasm: reg256 b9 + +# qhasm: reg256 b10 + +# qhasm: reg256 b11 + +# qhasm: reg256 b12 + +# qhasm: reg256 a0 + +# qhasm: reg256 a1 + +# qhasm: reg256 a2 + +# qhasm: reg256 a3 + +# qhasm: reg256 a4 + +# qhasm: reg256 a5 + +# qhasm: reg256 a6 + +# qhasm: reg256 r0 + +# qhasm: reg256 r1 + +# qhasm: reg256 r2 + +# qhasm: reg256 r3 + +# qhasm: reg256 r4 + +# qhasm: reg256 r5 + +# qhasm: reg256 r6 + +# qhasm: reg256 r7 + +# qhasm: reg256 r8 + +# qhasm: reg256 r9 + +# qhasm: reg256 r10 + +# qhasm: reg256 r11 + +# qhasm: reg256 r12 + +# qhasm: reg256 r13 + +# qhasm: reg256 r14 + +# qhasm: reg256 r15 + +# qhasm: reg256 r16 + +# qhasm: reg256 r17 + +# qhasm: reg256 r18 + +# qhasm: reg256 r19 + +# qhasm: reg256 r20 + +# qhasm: reg256 r21 + +# qhasm: reg256 r22 + +# qhasm: reg256 r23 + +# qhasm: reg256 r24 + +# qhasm: reg256 r + +# qhasm: reg128 h0 + +# qhasm: reg128 h1 + +# qhasm: reg128 h2 + +# qhasm: reg128 h3 + +# qhasm: reg128 h4 + +# qhasm: reg128 h5 + +# qhasm: reg128 h6 + +# qhasm: reg128 h7 + +# qhasm: reg128 h8 + +# qhasm: reg128 h9 + +# qhasm: reg128 h10 + +# qhasm: reg128 h11 + +# qhasm: reg128 h12 + +# qhasm: reg128 h13 + +# qhasm: reg128 h14 + +# qhasm: reg128 h15 + +# qhasm: reg128 
h16 + +# qhasm: reg128 h17 + +# qhasm: reg128 h18 + +# qhasm: reg128 h19 + +# qhasm: reg128 h20 + +# qhasm: reg128 h21 + +# qhasm: reg128 h22 + +# qhasm: reg128 h23 + +# qhasm: reg128 h24 + +# qhasm: stack4864 buf + +# qhasm: int64 ptr + +# qhasm: int64 tmp + +# qhasm: enter vec128_mul_asm +.p2align 5 +.global _PQCLEAN_MCELIECE6688128F_AVX_vec128_mul_asm +.global PQCLEAN_MCELIECE6688128F_AVX_vec128_mul_asm +_PQCLEAN_MCELIECE6688128F_AVX_vec128_mul_asm: +PQCLEAN_MCELIECE6688128F_AVX_vec128_mul_asm: +mov %rsp,%r11 +and $31,%r11 +add $608,%r11 +sub %r11,%rsp + +# qhasm: ptr = &buf +# asm 1: leaq ptr=int64#5 +# asm 2: leaq ptr=%r8 +leaq 0(%rsp),%r8 + +# qhasm: tmp = input_3 +# asm 1: mov tmp=int64#6 +# asm 2: mov tmp=%r9 +mov %rcx,%r9 + +# qhasm: tmp *= 12 +# asm 1: imulq $12,tmp=int64#6 +# asm 2: imulq $12,tmp=%r9 +imulq $12,%r9,%r9 + +# qhasm: input_2 += tmp +# asm 1: add b12=reg256#1 +# asm 2: vbroadcasti128 0(b12=%ymm0 +vbroadcasti128 0(%rdx), %ymm0 + +# qhasm: input_2 -= input_3 +# asm 1: sub a6=reg256#2 +# asm 2: vpxor a6=%ymm1 +vpxor %ymm1,%ymm1,%ymm1 + +# qhasm: a6[0] = mem128[ input_1 + 96 ] +# asm 1: vinsertf128 $0x0,96(r18=reg256#3 +# asm 2: vpand r18=%ymm2 +vpand %ymm0,%ymm1,%ymm2 + +# qhasm: mem256[ ptr + 576 ] = r18 +# asm 1: vmovupd r17=reg256#4 +# asm 2: vpand r17=%ymm3 +vpand %ymm0,%ymm2,%ymm3 + +# qhasm: a4[0] = mem128[ input_1 + 64 ] +# asm 1: vinsertf128 $0x0,64(r16=reg256#6 +# asm 2: vpand r16=%ymm5 +vpand %ymm0,%ymm4,%ymm5 + +# qhasm: a3[0] = mem128[ input_1 + 48 ] +# asm 1: vinsertf128 $0x0,48(r15=reg256#8 +# asm 2: vpand r15=%ymm7 +vpand %ymm0,%ymm6,%ymm7 + +# qhasm: a2[0] = mem128[ input_1 + 32 ] +# asm 1: vinsertf128 $0x0,32(r14=reg256#10 +# asm 2: vpand r14=%ymm9 +vpand %ymm0,%ymm8,%ymm9 + +# qhasm: a1[0] = mem128[ input_1 + 16 ] +# asm 1: vinsertf128 $0x0,16(r13=reg256#12 +# asm 2: vpand r13=%ymm11 +vpand %ymm0,%ymm10,%ymm11 + +# qhasm: a0[0] = mem128[ input_1 + 0 ] +# asm 1: vinsertf128 $0x0,0(r12=reg256#1 +# asm 2: vpand r12=%ymm0 +vpand %ymm0,%ymm12,%ymm0 + +# qhasm: b11 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b11=reg256#14 +# asm 2: vbroadcasti128 0(b11=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm2,%ymm3 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm4,%ymm3 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm6,%ymm3 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm8,%ymm3 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm10,%ymm3 + +# qhasm: r12 ^= r +# asm 1: vpxor r11=reg256#4 +# asm 2: vpand r11=%ymm3 +vpand %ymm13,%ymm12,%ymm3 + +# qhasm: b10 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b10=reg256#14 +# asm 2: vbroadcasti128 0(b10=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm2,%ymm5 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm4,%ymm5 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm6,%ymm5 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm8,%ymm5 + +# qhasm: r12 ^= r +# 
asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm10,%ymm5 + +# qhasm: r11 ^= r +# asm 1: vpxor r10=reg256#6 +# asm 2: vpand r10=%ymm5 +vpand %ymm13,%ymm12,%ymm5 + +# qhasm: b9 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b9=reg256#14 +# asm 2: vbroadcasti128 0(b9=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm2,%ymm7 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm4,%ymm7 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm6,%ymm7 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm8,%ymm7 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm10,%ymm7 + +# qhasm: r10 ^= r +# asm 1: vpxor r9=reg256#8 +# asm 2: vpand r9=%ymm7 +vpand %ymm13,%ymm12,%ymm7 + +# qhasm: b8 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b8=reg256#14 +# asm 2: vbroadcasti128 0(b8=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm2,%ymm9 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm6,%ymm9 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm8,%ymm9 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm10,%ymm9 + +# qhasm: r9 ^= r +# asm 1: vpxor r8=reg256#10 +# asm 2: vpand r8=%ymm9 +vpand %ymm13,%ymm12,%ymm9 + +# qhasm: b7 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b7=reg256#14 +# asm 2: vbroadcasti128 0(b7=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm2,%ymm11 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm4,%ymm11 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm6,%ymm11 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm8,%ymm11 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm10,%ymm11 + +# qhasm: r8 ^= r +# asm 1: vpxor r7=reg256#12 +# asm 2: vpand r7=%ymm11 +vpand %ymm13,%ymm12,%ymm11 + +# qhasm: b6 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b6=reg256#14 +# asm 2: vbroadcasti128 0(b6=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm2,%ymm0 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm4,%ymm0 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm6,%ymm0 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm8,%ymm0 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm10,%ymm0 + +# qhasm: r7 ^= r +# asm 1: vpxor r6=reg256#1 +# 
asm 2: vpand r6=%ymm0 +vpand %ymm13,%ymm12,%ymm0 + +# qhasm: b5 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b5=reg256#14 +# asm 2: vbroadcasti128 0(b5=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm2,%ymm3 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm4,%ymm3 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm6,%ymm3 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm8,%ymm3 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm10,%ymm3 + +# qhasm: r6 ^= r +# asm 1: vpxor r5=reg256#4 +# asm 2: vpand r5=%ymm3 +vpand %ymm13,%ymm12,%ymm3 + +# qhasm: b4 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b4=reg256#14 +# asm 2: vbroadcasti128 0(b4=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm2,%ymm5 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm4,%ymm5 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm6,%ymm5 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm8,%ymm5 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm10,%ymm5 + +# qhasm: r5 ^= r +# asm 1: vpxor r4=reg256#6 +# asm 2: vpand r4=%ymm5 +vpand %ymm13,%ymm12,%ymm5 + +# qhasm: b3 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b3=reg256#14 +# asm 2: vbroadcasti128 0(b3=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm2,%ymm7 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm4,%ymm7 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm6,%ymm7 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm8,%ymm7 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm10,%ymm7 + +# qhasm: r4 ^= r +# asm 1: vpxor r3=reg256#8 +# asm 2: vpand r3=%ymm7 +vpand %ymm13,%ymm12,%ymm7 + +# qhasm: b2 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b2=reg256#14 +# asm 2: vbroadcasti128 0(b2=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm2,%ymm9 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm6,%ymm9 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm8,%ymm9 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm10,%ymm9 + +# qhasm: r3 ^= r +# asm 1: vpxor r2=reg256#10 +# asm 2: vpand r2=%ymm9 +vpand %ymm13,%ymm12,%ymm9 + +# qhasm: b1 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b1=reg256#14 +# asm 2: vbroadcasti128 
0(b1=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm2,%ymm11 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm4,%ymm11 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm6,%ymm11 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm8,%ymm11 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm10,%ymm11 + +# qhasm: r2 ^= r +# asm 1: vpxor r1=reg256#12 +# asm 2: vpand r1=%ymm11 +vpand %ymm13,%ymm12,%ymm11 + +# qhasm: b0 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b0=reg256#14 +# asm 2: vbroadcasti128 0(b0=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#2 +# asm 2: vpand r=%ymm1 +vpand %ymm13,%ymm1,%ymm1 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm2,%ymm0 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm4,%ymm0 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm6,%ymm0 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm8,%ymm0 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm10,%ymm0 + +# qhasm: r1 ^= r +# asm 1: vpxor r0=reg256#1 +# asm 2: vpand r0=%ymm0 +vpand %ymm13,%ymm12,%ymm0 + +# qhasm: mem256[ ptr + 160 ] = r5 +# asm 1: vmovupd h24=reg128#1 +# asm 2: movdqu 560(h24=%xmm0 +movdqu 560(%r8),%xmm0 + +# qhasm: h11 = h24 +# asm 1: movdqa h11=reg128#2 +# asm 2: movdqa h11=%xmm1 +movdqa %xmm0,%xmm1 + +# qhasm: h12 = h24 +# asm 1: movdqa h12=reg128#3 +# asm 2: movdqa h12=%xmm2 +movdqa %xmm0,%xmm2 + +# qhasm: h14 = h24 +# asm 1: movdqa h14=reg128#4 +# asm 2: movdqa h14=%xmm3 +movdqa %xmm0,%xmm3 + +# qhasm: h15 = h24 +# asm 1: movdqa h15=reg128#1 +# asm 2: movdqa h15=%xmm0 +movdqa %xmm0,%xmm0 + +# qhasm: h23 = mem128[ ptr + 528 ] +# asm 1: movdqu 528(h23=reg128#5 +# asm 2: movdqu 528(h23=%xmm4 +movdqu 528(%r8),%xmm4 + +# qhasm: h10 = h23 +# asm 1: movdqa h10=reg128#6 +# asm 2: movdqa h10=%xmm5 +movdqa %xmm4,%xmm5 + +# qhasm: h11 = h11 ^ h23 +# asm 1: vpxor h11=reg128#2 +# asm 2: vpxor h11=%xmm1 +vpxor %xmm4,%xmm1,%xmm1 + +# qhasm: h13 = h23 +# asm 1: movdqa h13=reg128#7 +# asm 2: movdqa h13=%xmm6 +movdqa %xmm4,%xmm6 + +# qhasm: h14 = h14 ^ h23 +# asm 1: vpxor h14=reg128#4 +# asm 2: vpxor h14=%xmm3 +vpxor %xmm4,%xmm3,%xmm3 + +# qhasm: h22 = mem128[ ptr + 496 ] +# asm 1: movdqu 496(h22=reg128#5 +# asm 2: movdqu 496(h22=%xmm4 +movdqu 496(%r8),%xmm4 + +# qhasm: h9 = h22 +# asm 1: movdqa h9=reg128#8 +# asm 2: movdqa h9=%xmm7 +movdqa %xmm4,%xmm7 + +# qhasm: h10 = h10 ^ h22 +# asm 1: vpxor h10=reg128#6 +# asm 2: vpxor h10=%xmm5 +vpxor %xmm4,%xmm5,%xmm5 + +# qhasm: h12 = h12 ^ h22 +# asm 1: vpxor h12=reg128#3 +# asm 2: vpxor h12=%xmm2 +vpxor %xmm4,%xmm2,%xmm2 + +# qhasm: h13 = h13 ^ h22 +# asm 1: vpxor h13=reg128#5 +# asm 2: vpxor h13=%xmm4 +vpxor %xmm4,%xmm6,%xmm4 + +# qhasm: h21 = mem128[ ptr + 464 ] +# asm 1: movdqu 464(h21=reg128#7 +# asm 2: movdqu 464(h21=%xmm6 +movdqu 464(%r8),%xmm6 + +# qhasm: h8 = h21 +# asm 1: movdqa h8=reg128#9 +# asm 2: movdqa h8=%xmm8 +movdqa %xmm6,%xmm8 + +# qhasm: h9 = h9 ^ h21 +# asm 1: vpxor h9=reg128#8 +# asm 2: vpxor h9=%xmm7 +vpxor %xmm6,%xmm7,%xmm7 + +# qhasm: h11 = h11 ^ 
h21 +# asm 1: vpxor h11=reg128#2 +# asm 2: vpxor h11=%xmm1 +vpxor %xmm6,%xmm1,%xmm1 + +# qhasm: h12 = h12 ^ h21 +# asm 1: vpxor h12=reg128#3 +# asm 2: vpxor h12=%xmm2 +vpxor %xmm6,%xmm2,%xmm2 + +# qhasm: h20 = mem128[ ptr + 432 ] +# asm 1: movdqu 432(h20=reg128#7 +# asm 2: movdqu 432(h20=%xmm6 +movdqu 432(%r8),%xmm6 + +# qhasm: h7 = h20 +# asm 1: movdqa h7=reg128#10 +# asm 2: movdqa h7=%xmm9 +movdqa %xmm6,%xmm9 + +# qhasm: h8 = h8 ^ h20 +# asm 1: vpxor h8=reg128#9 +# asm 2: vpxor h8=%xmm8 +vpxor %xmm6,%xmm8,%xmm8 + +# qhasm: h10 = h10 ^ h20 +# asm 1: vpxor h10=reg128#6 +# asm 2: vpxor h10=%xmm5 +vpxor %xmm6,%xmm5,%xmm5 + +# qhasm: h11 = h11 ^ h20 +# asm 1: vpxor h11=reg128#2 +# asm 2: vpxor h11=%xmm1 +vpxor %xmm6,%xmm1,%xmm1 + +# qhasm: h19 = mem128[ ptr + 400 ] +# asm 1: movdqu 400(h19=reg128#7 +# asm 2: movdqu 400(h19=%xmm6 +movdqu 400(%r8),%xmm6 + +# qhasm: h6 = h19 +# asm 1: movdqa h6=reg128#11 +# asm 2: movdqa h6=%xmm10 +movdqa %xmm6,%xmm10 + +# qhasm: h7 = h7 ^ h19 +# asm 1: vpxor h7=reg128#10 +# asm 2: vpxor h7=%xmm9 +vpxor %xmm6,%xmm9,%xmm9 + +# qhasm: h9 = h9 ^ h19 +# asm 1: vpxor h9=reg128#8 +# asm 2: vpxor h9=%xmm7 +vpxor %xmm6,%xmm7,%xmm7 + +# qhasm: h10 = h10 ^ h19 +# asm 1: vpxor h10=reg128#6 +# asm 2: vpxor h10=%xmm5 +vpxor %xmm6,%xmm5,%xmm5 + +# qhasm: h18 = mem128[ ptr + 368 ] +# asm 1: movdqu 368(h18=reg128#7 +# asm 2: movdqu 368(h18=%xmm6 +movdqu 368(%r8),%xmm6 + +# qhasm: h18 = h18 ^ mem128[ ptr + 576 ] +# asm 1: vpxor 576(h18=reg128#7 +# asm 2: vpxor 576(h18=%xmm6 +vpxor 576(%r8),%xmm6,%xmm6 + +# qhasm: h5 = h18 +# asm 1: movdqa h5=reg128#12 +# asm 2: movdqa h5=%xmm11 +movdqa %xmm6,%xmm11 + +# qhasm: h6 = h6 ^ h18 +# asm 1: vpxor h6=reg128#11 +# asm 2: vpxor h6=%xmm10 +vpxor %xmm6,%xmm10,%xmm10 + +# qhasm: h8 = h8 ^ h18 +# asm 1: vpxor h8=reg128#9 +# asm 2: vpxor h8=%xmm8 +vpxor %xmm6,%xmm8,%xmm8 + +# qhasm: h9 = h9 ^ h18 +# asm 1: vpxor h9=reg128#7 +# asm 2: vpxor h9=%xmm6 +vpxor %xmm6,%xmm7,%xmm6 + +# qhasm: h17 = mem128[ ptr + 336 ] +# asm 1: movdqu 336(h17=reg128#8 +# asm 2: movdqu 336(h17=%xmm7 +movdqu 336(%r8),%xmm7 + +# qhasm: h17 = h17 ^ mem128[ ptr + 544 ] +# asm 1: vpxor 544(h17=reg128#8 +# asm 2: vpxor 544(h17=%xmm7 +vpxor 544(%r8),%xmm7,%xmm7 + +# qhasm: h4 = h17 +# asm 1: movdqa h4=reg128#13 +# asm 2: movdqa h4=%xmm12 +movdqa %xmm7,%xmm12 + +# qhasm: h5 = h5 ^ h17 +# asm 1: vpxor h5=reg128#12 +# asm 2: vpxor h5=%xmm11 +vpxor %xmm7,%xmm11,%xmm11 + +# qhasm: h7 = h7 ^ h17 +# asm 1: vpxor h7=reg128#10 +# asm 2: vpxor h7=%xmm9 +vpxor %xmm7,%xmm9,%xmm9 + +# qhasm: h8 = h8 ^ h17 +# asm 1: vpxor h8=reg128#8 +# asm 2: vpxor h8=%xmm7 +vpxor %xmm7,%xmm8,%xmm7 + +# qhasm: h16 = mem128[ ptr + 304 ] +# asm 1: movdqu 304(h16=reg128#9 +# asm 2: movdqu 304(h16=%xmm8 +movdqu 304(%r8),%xmm8 + +# qhasm: h16 = h16 ^ mem128[ ptr + 512 ] +# asm 1: vpxor 512(h16=reg128#9 +# asm 2: vpxor 512(h16=%xmm8 +vpxor 512(%r8),%xmm8,%xmm8 + +# qhasm: h3 = h16 +# asm 1: movdqa h3=reg128#14 +# asm 2: movdqa h3=%xmm13 +movdqa %xmm8,%xmm13 + +# qhasm: h4 = h4 ^ h16 +# asm 1: vpxor h4=reg128#13 +# asm 2: vpxor h4=%xmm12 +vpxor %xmm8,%xmm12,%xmm12 + +# qhasm: h6 = h6 ^ h16 +# asm 1: vpxor h6=reg128#11 +# asm 2: vpxor h6=%xmm10 +vpxor %xmm8,%xmm10,%xmm10 + +# qhasm: h7 = h7 ^ h16 +# asm 1: vpxor h7=reg128#9 +# asm 2: vpxor h7=%xmm8 +vpxor %xmm8,%xmm9,%xmm8 + +# qhasm: h15 = h15 ^ mem128[ ptr + 272 ] +# asm 1: vpxor 272(h15=reg128#1 +# asm 2: vpxor 272(h15=%xmm0 +vpxor 272(%r8),%xmm0,%xmm0 + +# qhasm: h15 = h15 ^ mem128[ ptr + 480 ] +# asm 1: vpxor 480(h15=reg128#1 +# asm 2: vpxor 480(h15=%xmm0 
+vpxor 480(%r8),%xmm0,%xmm0 + +# qhasm: h2 = h15 +# asm 1: movdqa h2=reg128#10 +# asm 2: movdqa h2=%xmm9 +movdqa %xmm0,%xmm9 + +# qhasm: h3 = h3 ^ h15 +# asm 1: vpxor h3=reg128#14 +# asm 2: vpxor h3=%xmm13 +vpxor %xmm0,%xmm13,%xmm13 + +# qhasm: h5 = h5 ^ h15 +# asm 1: vpxor h5=reg128#12 +# asm 2: vpxor h5=%xmm11 +vpxor %xmm0,%xmm11,%xmm11 + +# qhasm: h6 = h6 ^ h15 +# asm 1: vpxor h6=reg128#1 +# asm 2: vpxor h6=%xmm0 +vpxor %xmm0,%xmm10,%xmm0 + +# qhasm: h14 = h14 ^ mem128[ ptr + 240 ] +# asm 1: vpxor 240(h14=reg128#4 +# asm 2: vpxor 240(h14=%xmm3 +vpxor 240(%r8),%xmm3,%xmm3 + +# qhasm: h14 = h14 ^ mem128[ ptr + 448 ] +# asm 1: vpxor 448(h14=reg128#4 +# asm 2: vpxor 448(h14=%xmm3 +vpxor 448(%r8),%xmm3,%xmm3 + +# qhasm: h1 = h14 +# asm 1: movdqa h1=reg128#11 +# asm 2: movdqa h1=%xmm10 +movdqa %xmm3,%xmm10 + +# qhasm: h2 = h2 ^ h14 +# asm 1: vpxor h2=reg128#10 +# asm 2: vpxor h2=%xmm9 +vpxor %xmm3,%xmm9,%xmm9 + +# qhasm: h4 = h4 ^ h14 +# asm 1: vpxor h4=reg128#13 +# asm 2: vpxor h4=%xmm12 +vpxor %xmm3,%xmm12,%xmm12 + +# qhasm: h5 = h5 ^ h14 +# asm 1: vpxor h5=reg128#4 +# asm 2: vpxor h5=%xmm3 +vpxor %xmm3,%xmm11,%xmm3 + +# qhasm: h13 = h13 ^ mem128[ ptr + 208 ] +# asm 1: vpxor 208(h13=reg128#5 +# asm 2: vpxor 208(h13=%xmm4 +vpxor 208(%r8),%xmm4,%xmm4 + +# qhasm: h13 = h13 ^ mem128[ ptr + 416 ] +# asm 1: vpxor 416(h13=reg128#5 +# asm 2: vpxor 416(h13=%xmm4 +vpxor 416(%r8),%xmm4,%xmm4 + +# qhasm: h0 = h13 +# asm 1: movdqa h0=reg128#12 +# asm 2: movdqa h0=%xmm11 +movdqa %xmm4,%xmm11 + +# qhasm: h1 = h1 ^ h13 +# asm 1: vpxor h1=reg128#11 +# asm 2: vpxor h1=%xmm10 +vpxor %xmm4,%xmm10,%xmm10 + +# qhasm: h3 = h3 ^ h13 +# asm 1: vpxor h3=reg128#14 +# asm 2: vpxor h3=%xmm13 +vpxor %xmm4,%xmm13,%xmm13 + +# qhasm: h4 = h4 ^ h13 +# asm 1: vpxor h4=reg128#5 +# asm 2: vpxor h4=%xmm4 +vpxor %xmm4,%xmm12,%xmm4 + +# qhasm: h12 = h12 ^ mem128[ ptr + 384 ] +# asm 1: vpxor 384(h12=reg128#3 +# asm 2: vpxor 384(h12=%xmm2 +vpxor 384(%r8),%xmm2,%xmm2 + +# qhasm: h12 = h12 ^ mem128[ ptr + 176 ] +# asm 1: vpxor 176(h12=reg128#3 +# asm 2: vpxor 176(h12=%xmm2 +vpxor 176(%r8),%xmm2,%xmm2 + +# qhasm: mem128[ input_0 + 192 ] = h12 +# asm 1: movdqu h11=reg128#2 +# asm 2: vpxor 352(h11=%xmm1 +vpxor 352(%r8),%xmm1,%xmm1 + +# qhasm: h11 = h11 ^ mem128[ ptr + 144 ] +# asm 1: vpxor 144(h11=reg128#2 +# asm 2: vpxor 144(h11=%xmm1 +vpxor 144(%r8),%xmm1,%xmm1 + +# qhasm: mem128[ input_0 + 176 ] = h11 +# asm 1: movdqu h10=reg128#2 +# asm 2: vpxor 320(h10=%xmm1 +vpxor 320(%r8),%xmm5,%xmm1 + +# qhasm: h10 = h10 ^ mem128[ ptr + 112 ] +# asm 1: vpxor 112(h10=reg128#2 +# asm 2: vpxor 112(h10=%xmm1 +vpxor 112(%r8),%xmm1,%xmm1 + +# qhasm: mem128[ input_0 + 160 ] = h10 +# asm 1: movdqu h9=reg128#2 +# asm 2: vpxor 288(h9=%xmm1 +vpxor 288(%r8),%xmm6,%xmm1 + +# qhasm: h9 = h9 ^ mem128[ ptr + 80 ] +# asm 1: vpxor 80(h9=reg128#2 +# asm 2: vpxor 80(h9=%xmm1 +vpxor 80(%r8),%xmm1,%xmm1 + +# qhasm: mem128[ input_0 + 144 ] = h9 +# asm 1: movdqu h8=reg128#2 +# asm 2: vpxor 256(h8=%xmm1 +vpxor 256(%r8),%xmm7,%xmm1 + +# qhasm: h8 = h8 ^ mem128[ ptr + 48 ] +# asm 1: vpxor 48(h8=reg128#2 +# asm 2: vpxor 48(h8=%xmm1 +vpxor 48(%r8),%xmm1,%xmm1 + +# qhasm: mem128[ input_0 + 128 ] = h8 +# asm 1: movdqu h7=reg128#2 +# asm 2: vpxor 224(h7=%xmm1 +vpxor 224(%r8),%xmm8,%xmm1 + +# qhasm: h7 = h7 ^ mem128[ ptr + 16 ] +# asm 1: vpxor 16(h7=reg128#2 +# asm 2: vpxor 16(h7=%xmm1 +vpxor 16(%r8),%xmm1,%xmm1 + +# qhasm: mem128[ input_0 + 112 ] = h7 +# asm 1: movdqu h6=reg128#1 +# asm 2: vpxor 192(h6=%xmm0 +vpxor 192(%r8),%xmm0,%xmm0 + +# qhasm: mem128[ input_0 + 96 ] = h6 
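+# note: in the folding above, each high limb hK (K = 13..24) of the 25-limb
+# bitsliced product is folded (via XOR) into limbs K-13, K-12, K-10 and K-9,
+# which is consistent with reduction modulo the GF(2^13) field polynomial
+# z^13 + z^4 + z^3 + z + 1 used by Classic McEliece; the reduced limbs
+# h12..h0 are then written back to the output at input_0.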
+# asm 1: movdqu h5=reg128#1 +# asm 2: vpxor 160(h5=%xmm0 +vpxor 160(%r8),%xmm3,%xmm0 + +# qhasm: mem128[ input_0 + 80 ] = h5 +# asm 1: movdqu h4=reg128#1 +# asm 2: vpxor 128(h4=%xmm0 +vpxor 128(%r8),%xmm4,%xmm0 + +# qhasm: mem128[ input_0 + 64 ] = h4 +# asm 1: movdqu h3=reg128#1 +# asm 2: vpxor 96(h3=%xmm0 +vpxor 96(%r8),%xmm13,%xmm0 + +# qhasm: mem128[ input_0 + 48 ] = h3 +# asm 1: movdqu h2=reg128#1 +# asm 2: vpxor 64(h2=%xmm0 +vpxor 64(%r8),%xmm9,%xmm0 + +# qhasm: mem128[ input_0 + 32 ] = h2 +# asm 1: movdqu h1=reg128#1 +# asm 2: vpxor 32(h1=%xmm0 +vpxor 32(%r8),%xmm10,%xmm0 + +# qhasm: mem128[ input_0 + 16 ] = h1 +# asm 1: movdqu h0=reg128#1 +# asm 2: vpxor 0(h0=%xmm0 +vpxor 0(%r8),%xmm11,%xmm0 + +# qhasm: mem128[ input_0 + 0 ] = h0 +# asm 1: movdqu + +typedef __m256i vec256; + +vec256 PQCLEAN_MCELIECE6688128F_AVX_vec256_set1_16b(uint16_t a); +vec256 PQCLEAN_MCELIECE6688128F_AVX_vec256_setzero(void); +vec256 PQCLEAN_MCELIECE6688128F_AVX_vec256_set4x(uint64_t a0, uint64_t a1, uint64_t a2, uint64_t a3); + +// Extract requires a literal argument so need to be macros +#define PQCLEAN_MCELIECE6688128F_AVX_vec256_extract2x(a,i) ((vec128) _mm256_extractf128_si256((vec256) (a),(i))) +#define PQCLEAN_MCELIECE6688128F_AVX_vec256_extract(a,i) ((uint64_t) _mm256_extract_epi64((vec256) (a),(i))) + +int PQCLEAN_MCELIECE6688128F_AVX_vec256_testz(vec256 a); +vec256 PQCLEAN_MCELIECE6688128F_AVX_vec256_and(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE6688128F_AVX_vec256_xor(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE6688128F_AVX_vec256_or(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE6688128F_AVX_vec256_sll_4x(vec256 a, int s); +vec256 PQCLEAN_MCELIECE6688128F_AVX_vec256_srl_4x(vec256 a, int s); +vec256 PQCLEAN_MCELIECE6688128F_AVX_vec256_unpack_low(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE6688128F_AVX_vec256_unpack_high(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE6688128F_AVX_vec256_unpack_low_2x(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE6688128F_AVX_vec256_unpack_high_2x(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE6688128F_AVX_vec256_or_reduce(const vec256 *a); +void PQCLEAN_MCELIECE6688128F_AVX_vec256_copy(vec256 *dest, const vec256 *src); + +/* bitsliced field multiplications */ +void PQCLEAN_MCELIECE6688128F_AVX_vec256_mul(vec256 *h, vec256 *f, const vec256 *g); +void PQCLEAN_MCELIECE6688128F_AVX_vec256_sq(vec256 * /*out*/, const vec256 * /*in*/); +void PQCLEAN_MCELIECE6688128F_AVX_vec256_inv(vec256 * /*out*/, const vec256 * /*in*/); + +extern void PQCLEAN_MCELIECE6688128F_AVX_vec256_maa_asm(vec256 *, vec256 *, const vec256 *); +extern void PQCLEAN_MCELIECE6688128F_AVX_vec256_ama_asm(vec256 *, vec256 *, const vec256 *); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/vec256_ama_asm.S b/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/vec256_ama_asm.S new file mode 100644 index 0000000000..043839c194 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6688128f_avx/vec256_ama_asm.S @@ -0,0 +1,2322 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: reg256 a0 + +# qhasm: reg256 a1 + +# qhasm: reg256 a2 + +# qhasm: reg256 a3 + +# qhasm: reg256 a4 + +# qhasm: reg256 a5 + +# 
qhasm: reg256 a6 + +# qhasm: reg256 a7 + +# qhasm: reg256 a8 + +# qhasm: reg256 a9 + +# qhasm: reg256 a10 + +# qhasm: reg256 a11 + +# qhasm: reg256 a12 + +# qhasm: reg256 b0 + +# qhasm: reg256 b1 + +# qhasm: reg256 r0 + +# qhasm: reg256 r1 + +# qhasm: reg256 r2 + +# qhasm: reg256 r3 + +# qhasm: reg256 r4 + +# qhasm: reg256 r5 + +# qhasm: reg256 r6 + +# qhasm: reg256 r7 + +# qhasm: reg256 r8 + +# qhasm: reg256 r9 + +# qhasm: reg256 r10 + +# qhasm: reg256 r11 + +# qhasm: reg256 r12 + +# qhasm: reg256 r13 + +# qhasm: reg256 r14 + +# qhasm: reg256 r15 + +# qhasm: reg256 r16 + +# qhasm: reg256 r17 + +# qhasm: reg256 r18 + +# qhasm: reg256 r19 + +# qhasm: reg256 r20 + +# qhasm: reg256 r21 + +# qhasm: reg256 r22 + +# qhasm: reg256 r23 + +# qhasm: reg256 r24 + +# qhasm: reg256 r + +# qhasm: enter vec256_ama_asm +.p2align 5 +.global _PQCLEAN_MCELIECE6688128F_AVX_vec256_ama_asm +.global PQCLEAN_MCELIECE6688128F_AVX_vec256_ama_asm +_PQCLEAN_MCELIECE6688128F_AVX_vec256_ama_asm: +PQCLEAN_MCELIECE6688128F_AVX_vec256_ama_asm: +mov %rsp,%r11 +and $31,%r11 +add $0,%r11 +sub %r11,%rsp + +# qhasm: b0 = mem256[ input_2 + 0 ] +# asm 1: vmovupd 0(b0=reg256#1 +# asm 2: vmovupd 0(b0=%ymm0 +vmovupd 0(%rdx),%ymm0 + +# qhasm: a12 = mem256[ input_0 + 384 ] +# asm 1: vmovupd 384(a12=reg256#2 +# asm 2: vmovupd 384(a12=%ymm1 +vmovupd 384(%rdi),%ymm1 + +# qhasm: a12 = a12 ^ mem256[ input_1 + 384 ] +# asm 1: vpxor 384(a12=reg256#2 +# asm 2: vpxor 384(a12=%ymm1 +vpxor 384(%rsi),%ymm1,%ymm1 + +# qhasm: mem256[ input_0 + 384 ] = a12 +# asm 1: vmovupd r12=reg256#3 +# asm 2: vpand r12=%ymm2 +vpand %ymm1,%ymm0,%ymm2 + +# qhasm: r13 = a12 & mem256[input_2 + 32] +# asm 1: vpand 32(r13=reg256#4 +# asm 2: vpand 32(r13=%ymm3 +vpand 32(%rdx),%ymm1,%ymm3 + +# qhasm: r14 = a12 & mem256[input_2 + 64] +# asm 1: vpand 64(r14=reg256#5 +# asm 2: vpand 64(r14=%ymm4 +vpand 64(%rdx),%ymm1,%ymm4 + +# qhasm: r15 = a12 & mem256[input_2 + 96] +# asm 1: vpand 96(r15=reg256#6 +# asm 2: vpand 96(r15=%ymm5 +vpand 96(%rdx),%ymm1,%ymm5 + +# qhasm: r16 = a12 & mem256[input_2 + 128] +# asm 1: vpand 128(r16=reg256#7 +# asm 2: vpand 128(r16=%ymm6 +vpand 128(%rdx),%ymm1,%ymm6 + +# qhasm: r17 = a12 & mem256[input_2 + 160] +# asm 1: vpand 160(r17=reg256#8 +# asm 2: vpand 160(r17=%ymm7 +vpand 160(%rdx),%ymm1,%ymm7 + +# qhasm: r18 = a12 & mem256[input_2 + 192] +# asm 1: vpand 192(r18=reg256#9 +# asm 2: vpand 192(r18=%ymm8 +vpand 192(%rdx),%ymm1,%ymm8 + +# qhasm: r19 = a12 & mem256[input_2 + 224] +# asm 1: vpand 224(r19=reg256#10 +# asm 2: vpand 224(r19=%ymm9 +vpand 224(%rdx),%ymm1,%ymm9 + +# qhasm: r20 = a12 & mem256[input_2 + 256] +# asm 1: vpand 256(r20=reg256#11 +# asm 2: vpand 256(r20=%ymm10 +vpand 256(%rdx),%ymm1,%ymm10 + +# qhasm: r21 = a12 & mem256[input_2 + 288] +# asm 1: vpand 288(r21=reg256#12 +# asm 2: vpand 288(r21=%ymm11 +vpand 288(%rdx),%ymm1,%ymm11 + +# qhasm: r22 = a12 & mem256[input_2 + 320] +# asm 1: vpand 320(r22=reg256#13 +# asm 2: vpand 320(r22=%ymm12 +vpand 320(%rdx),%ymm1,%ymm12 + +# qhasm: r23 = a12 & mem256[input_2 + 352] +# asm 1: vpand 352(r23=reg256#14 +# asm 2: vpand 352(r23=%ymm13 +vpand 352(%rdx),%ymm1,%ymm13 + +# qhasm: r24 = a12 & mem256[input_2 + 384] +# asm 1: vpand 384(r24=reg256#2 +# asm 2: vpand 384(r24=%ymm1 +vpand 384(%rdx),%ymm1,%ymm1 + +# qhasm: r15 ^= r24 +# asm 1: vpxor r11=reg256#2 +# asm 2: vmovapd r11=%ymm1 +vmovapd %ymm1,%ymm1 + +# qhasm: a11 = mem256[ input_0 + 352 ] +# asm 1: vmovupd 352(a11=reg256#15 +# asm 2: vmovupd 352(a11=%ymm14 +vmovupd 352(%rdi),%ymm14 + +# qhasm: a11 = a11 ^ mem256[ input_1 + 352 ] +# asm 
1: vpxor 352(a11=reg256#15 +# asm 2: vpxor 352(a11=%ymm14 +vpxor 352(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 352 ] = a11 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r21 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r22 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r23 ^= r +# asm 1: vpxor r10=reg256#14 +# asm 2: vmovapd r10=%ymm13 +vmovapd %ymm13,%ymm13 + +# qhasm: a10 = mem256[ input_0 + 320 ] +# asm 1: vmovupd 320(a10=reg256#15 +# asm 2: vmovupd 320(a10=%ymm14 +vmovupd 320(%rdi),%ymm14 + +# qhasm: a10 = a10 ^ mem256[ input_1 + 320 ] +# asm 1: vpxor 320(a10=reg256#15 +# asm 2: vpxor 320(a10=%ymm14 +vpxor 320(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 320 ] = a10 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r21 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r22 ^= r +# asm 1: vpxor r9=reg256#13 +# asm 2: vmovapd r9=%ymm12 +vmovapd %ymm12,%ymm12 + +# qhasm: a9 = mem256[ input_0 + 288 ] +# asm 1: vmovupd 288(a9=reg256#15 +# asm 2: vmovupd 288(a9=%ymm14 +vmovupd 
288(%rdi),%ymm14 + +# qhasm: a9 = a9 ^ mem256[ input_1 + 288 ] +# asm 1: vpxor 288(a9=reg256#15 +# asm 2: vpxor 288(a9=%ymm14 +vpxor 288(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 288 ] = a9 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r21 ^= r +# asm 1: vpxor r8=reg256#12 +# asm 2: vmovapd r8=%ymm11 +vmovapd %ymm11,%ymm11 + +# qhasm: a8 = mem256[ input_0 + 256 ] +# asm 1: vmovupd 256(a8=reg256#15 +# asm 2: vmovupd 256(a8=%ymm14 +vmovupd 256(%rdi),%ymm14 + +# qhasm: a8 = a8 ^ mem256[ input_1 + 256 ] +# asm 1: vpxor 256(a8=reg256#15 +# asm 2: vpxor 256(a8=%ymm14 +vpxor 256(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 256 ] = a8 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r20 ^= r +# asm 1: vpxor r7=reg256#11 +# asm 2: vmovapd r7=%ymm10 +vmovapd %ymm10,%ymm10 + +# qhasm: a7 = mem256[ input_0 + 224 ] +# asm 1: vmovupd 
224(a7=reg256#15 +# asm 2: vmovupd 224(a7=%ymm14 +vmovupd 224(%rdi),%ymm14 + +# qhasm: a7 = a7 ^ mem256[ input_1 + 224 ] +# asm 1: vpxor 224(a7=reg256#15 +# asm 2: vpxor 224(a7=%ymm14 +vpxor 224(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 224 ] = a7 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r19 ^= r +# asm 1: vpxor r6=reg256#10 +# asm 2: vmovapd r6=%ymm9 +vmovapd %ymm9,%ymm9 + +# qhasm: a6 = mem256[ input_0 + 192 ] +# asm 1: vmovupd 192(a6=reg256#15 +# asm 2: vmovupd 192(a6=%ymm14 +vmovupd 192(%rdi),%ymm14 + +# qhasm: a6 = a6 ^ mem256[ input_1 + 192 ] +# asm 1: vpxor 192(a6=reg256#15 +# asm 2: vpxor 192(a6=%ymm14 +vpxor 192(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 192 ] = a6 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r18 ^= r +# asm 1: vpxor r5=reg256#9 +# asm 2: vmovapd r5=%ymm8 +vmovapd %ymm8,%ymm8 + +# qhasm: a5 = mem256[ 
input_0 + 160 ] +# asm 1: vmovupd 160(a5=reg256#15 +# asm 2: vmovupd 160(a5=%ymm14 +vmovupd 160(%rdi),%ymm14 + +# qhasm: a5 = a5 ^ mem256[ input_1 + 160 ] +# asm 1: vpxor 160(a5=reg256#15 +# asm 2: vpxor 160(a5=%ymm14 +vpxor 160(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 160 ] = a5 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r17 ^= r +# asm 1: vpxor r4=reg256#8 +# asm 2: vmovapd r4=%ymm7 +vmovapd %ymm7,%ymm7 + +# qhasm: a4 = mem256[ input_0 + 128 ] +# asm 1: vmovupd 128(a4=reg256#15 +# asm 2: vmovupd 128(a4=%ymm14 +vmovupd 128(%rdi),%ymm14 + +# qhasm: a4 = a4 ^ mem256[ input_1 + 128 ] +# asm 1: vpxor 128(a4=reg256#15 +# asm 2: vpxor 128(a4=%ymm14 +vpxor 128(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 128 ] = a4 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r16 ^= r +# asm 1: vpxor r3=reg256#7 +# asm 2: vmovapd r3=%ymm6 +vmovapd 
%ymm6,%ymm6 + +# qhasm: a3 = mem256[ input_0 + 96 ] +# asm 1: vmovupd 96(a3=reg256#15 +# asm 2: vmovupd 96(a3=%ymm14 +vmovupd 96(%rdi),%ymm14 + +# qhasm: a3 = a3 ^ mem256[ input_1 + 96 ] +# asm 1: vpxor 96(a3=reg256#15 +# asm 2: vpxor 96(a3=%ymm14 +vpxor 96(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 96 ] = a3 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r15 ^= r +# asm 1: vpxor r2=reg256#6 +# asm 2: vmovapd r2=%ymm5 +vmovapd %ymm5,%ymm5 + +# qhasm: a2 = mem256[ input_0 + 64 ] +# asm 1: vmovupd 64(a2=reg256#15 +# asm 2: vmovupd 64(a2=%ymm14 +vmovupd 64(%rdi),%ymm14 + +# qhasm: a2 = a2 ^ mem256[ input_1 + 64 ] +# asm 1: vpxor 64(a2=reg256#15 +# asm 2: vpxor 64(a2=%ymm14 +vpxor 64(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 64 ] = a2 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r14 ^= r +# asm 1: vpxor r1=reg256#5 +# asm 2: vmovapd r1=%ymm4 
+vmovapd %ymm4,%ymm4 + +# qhasm: a1 = mem256[ input_0 + 32 ] +# asm 1: vmovupd 32(a1=reg256#15 +# asm 2: vmovupd 32(a1=%ymm14 +vmovupd 32(%rdi),%ymm14 + +# qhasm: a1 = a1 ^ mem256[ input_1 + 32 ] +# asm 1: vpxor 32(a1=reg256#15 +# asm 2: vpxor 32(a1=%ymm14 +vpxor 32(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 32 ] = a1 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r1 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r13 ^= r +# asm 1: vpxor r0=reg256#4 +# asm 2: vmovapd r0=%ymm3 +vmovapd %ymm3,%ymm3 + +# qhasm: a0 = mem256[ input_0 + 0 ] +# asm 1: vmovupd 0(a0=reg256#15 +# asm 2: vmovupd 0(a0=%ymm14 +vmovupd 0(%rdi),%ymm14 + +# qhasm: a0 = a0 ^ mem256[ input_1 + 0 ] +# asm 1: vpxor 0(a0=reg256#15 +# asm 2: vpxor 0(a0=%ymm14 +vpxor 0(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 0 ] = a0 +# asm 1: vmovupd r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm14,%ymm0,%ymm0 + +# qhasm: r0 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 32(r=%ymm0 +vpand 32(%rdx),%ymm14,%ymm0 + +# qhasm: r1 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 64(r=%ymm0 +vpand 64(%rdx),%ymm14,%ymm0 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 96(r=%ymm0 +vpand 96(%rdx),%ymm14,%ymm0 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 128(r=%ymm0 +vpand 128(%rdx),%ymm14,%ymm0 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 160(r=%ymm0 +vpand 160(%rdx),%ymm14,%ymm0 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 192(r=%ymm0 +vpand 192(%rdx),%ymm14,%ymm0 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 224(r=%ymm0 +vpand 224(%rdx),%ymm14,%ymm0 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 256(r=%ymm0 +vpand 256(%rdx),%ymm14,%ymm0 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 288(r=%ymm0 +vpand 288(%rdx),%ymm14,%ymm0 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 320(r=%ymm0 +vpand 320(%rdx),%ymm14,%ymm0 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 352(r=%ymm0 +vpand 352(%rdx),%ymm14,%ymm0 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 384(r=%ymm0 +vpand 384(%rdx),%ymm14,%ymm0 + +# qhasm: r12 ^= r +# asm 1: vpxor r12=reg256#1 +# asm 2: vpxor 384(r12=%ymm0 +vpxor 384(%rsi),%ymm2,%ymm0 + +# qhasm: 
mem256[ input_1 + 384 ] = r12 +# asm 1: vmovupd r11=reg256#1 +# asm 2: vpxor 352(r11=%ymm0 +vpxor 352(%rsi),%ymm1,%ymm0 + +# qhasm: mem256[ input_1 + 352 ] = r11 +# asm 1: vmovupd r10=reg256#1 +# asm 2: vpxor 320(r10=%ymm0 +vpxor 320(%rsi),%ymm13,%ymm0 + +# qhasm: mem256[ input_1 + 320 ] = r10 +# asm 1: vmovupd r9=reg256#1 +# asm 2: vpxor 288(r9=%ymm0 +vpxor 288(%rsi),%ymm12,%ymm0 + +# qhasm: mem256[ input_1 + 288 ] = r9 +# asm 1: vmovupd r8=reg256#1 +# asm 2: vpxor 256(r8=%ymm0 +vpxor 256(%rsi),%ymm11,%ymm0 + +# qhasm: mem256[ input_1 + 256 ] = r8 +# asm 1: vmovupd r7=reg256#1 +# asm 2: vpxor 224(r7=%ymm0 +vpxor 224(%rsi),%ymm10,%ymm0 + +# qhasm: mem256[ input_1 + 224 ] = r7 +# asm 1: vmovupd r6=reg256#1 +# asm 2: vpxor 192(r6=%ymm0 +vpxor 192(%rsi),%ymm9,%ymm0 + +# qhasm: mem256[ input_1 + 192 ] = r6 +# asm 1: vmovupd r5=reg256#1 +# asm 2: vpxor 160(r5=%ymm0 +vpxor 160(%rsi),%ymm8,%ymm0 + +# qhasm: mem256[ input_1 + 160 ] = r5 +# asm 1: vmovupd r4=reg256#1 +# asm 2: vpxor 128(r4=%ymm0 +vpxor 128(%rsi),%ymm7,%ymm0 + +# qhasm: mem256[ input_1 + 128 ] = r4 +# asm 1: vmovupd r3=reg256#1 +# asm 2: vpxor 96(r3=%ymm0 +vpxor 96(%rsi),%ymm6,%ymm0 + +# qhasm: mem256[ input_1 + 96 ] = r3 +# asm 1: vmovupd r2=reg256#1 +# asm 2: vpxor 64(r2=%ymm0 +vpxor 64(%rsi),%ymm5,%ymm0 + +# qhasm: mem256[ input_1 + 64 ] = r2 +# asm 1: vmovupd r1=reg256#1 +# asm 2: vpxor 32(r1=%ymm0 +vpxor 32(%rsi),%ymm4,%ymm0 + +# qhasm: mem256[ input_1 + 32 ] = r1 +# asm 1: vmovupd r0=reg256#1 +# asm 2: vpxor 0(r0=%ymm0 +vpxor 0(%rsi),%ymm3,%ymm0 + +# qhasm: mem256[ input_1 + 0 ] = r0 +# asm 1: vmovupd b0=reg256#1 +# asm 2: vmovupd 0(b0=%ymm0 +vmovupd 0(%rdx),%ymm0 + +# qhasm: a12 = mem256[ input_1 + 384 ] +# asm 1: vmovupd 384(a12=reg256#2 +# asm 2: vmovupd 384(a12=%ymm1 +vmovupd 384(%rsi),%ymm1 + +# qhasm: r12 = a12 & b0 +# asm 1: vpand r12=reg256#3 +# asm 2: vpand r12=%ymm2 +vpand %ymm1,%ymm0,%ymm2 + +# qhasm: r13 = a12 & mem256[input_2 + 32] +# asm 1: vpand 32(r13=reg256#4 +# asm 2: vpand 32(r13=%ymm3 +vpand 32(%rdx),%ymm1,%ymm3 + +# qhasm: r14 = a12 & mem256[input_2 + 64] +# asm 1: vpand 64(r14=reg256#5 +# asm 2: vpand 64(r14=%ymm4 +vpand 64(%rdx),%ymm1,%ymm4 + +# qhasm: r15 = a12 & mem256[input_2 + 96] +# asm 1: vpand 96(r15=reg256#6 +# asm 2: vpand 96(r15=%ymm5 +vpand 96(%rdx),%ymm1,%ymm5 + +# qhasm: r16 = a12 & mem256[input_2 + 128] +# asm 1: vpand 128(r16=reg256#7 +# asm 2: vpand 128(r16=%ymm6 +vpand 128(%rdx),%ymm1,%ymm6 + +# qhasm: r17 = a12 & mem256[input_2 + 160] +# asm 1: vpand 160(r17=reg256#8 +# asm 2: vpand 160(r17=%ymm7 +vpand 160(%rdx),%ymm1,%ymm7 + +# qhasm: r18 = a12 & mem256[input_2 + 192] +# asm 1: vpand 192(r18=reg256#9 +# asm 2: vpand 192(r18=%ymm8 +vpand 192(%rdx),%ymm1,%ymm8 + +# qhasm: r19 = a12 & mem256[input_2 + 224] +# asm 1: vpand 224(r19=reg256#10 +# asm 2: vpand 224(r19=%ymm9 +vpand 224(%rdx),%ymm1,%ymm9 + +# qhasm: r20 = a12 & mem256[input_2 + 256] +# asm 1: vpand 256(r20=reg256#11 +# asm 2: vpand 256(r20=%ymm10 +vpand 256(%rdx),%ymm1,%ymm10 + +# qhasm: r21 = a12 & mem256[input_2 + 288] +# asm 1: vpand 288(r21=reg256#12 +# asm 2: vpand 288(r21=%ymm11 +vpand 288(%rdx),%ymm1,%ymm11 + +# qhasm: r22 = a12 & mem256[input_2 + 320] +# asm 1: vpand 320(r22=reg256#13 +# asm 2: vpand 320(r22=%ymm12 +vpand 320(%rdx),%ymm1,%ymm12 + +# qhasm: r23 = a12 & mem256[input_2 + 352] +# asm 1: vpand 352(r23=reg256#14 +# asm 2: vpand 352(r23=%ymm13 +vpand 352(%rdx),%ymm1,%ymm13 + +# qhasm: r24 = a12 & mem256[input_2 + 384] +# asm 1: vpand 384(r24=reg256#2 +# asm 2: vpand 384(r24=%ymm1 +vpand 384(%rdx),%ymm1,%ymm1 + 
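For orientation, since the generated qhasm/AVX2 listing is hard to read on its own: the pattern visible throughout this file is a bitsliced schoolbook multiplication over GF(2^13). Each of the 13 coefficient vectors of one operand (a0..a12) is ANDed with every coefficient vector of the other operand and XORed into the running partial product r[i+j]; lines such as the r15 ^= r24 that follows fold the high partial products back according to the field polynomial. A rough, hypothetical plain-C sketch of that pattern — vec256, the helper names and the omitted reduction are illustrative stand-ins, not the actual PQClean/liboqs definitions — looks like this:

/* Hypothetical sketch of the bitsliced schoolbook multiplication expanded by
 * the qhasm listing.  vec256 stands for any 256-bit vector type with bitwise
 * AND/XOR; GFBITS is 13 for this parameter set. */
#include <string.h>

#define GFBITS 13

typedef struct { unsigned long long w[4]; } vec256;

static vec256 v_and(vec256 x, vec256 y) {
    vec256 r;
    for (int i = 0; i < 4; i++) { r.w[i] = x.w[i] & y.w[i]; }
    return r;
}

static vec256 v_xor(vec256 x, vec256 y) {
    vec256 r;
    for (int i = 0; i < 4; i++) { r.w[i] = x.w[i] ^ y.w[i]; }
    return r;
}

/* out[k] collects all partial products a[i] & b[j] with i + j == k. */
static void bitsliced_mul_sketch(vec256 out[2 * GFBITS - 1],
                                 const vec256 a[GFBITS],
                                 const vec256 b[GFBITS]) {
    memset(out, 0, sizeof(vec256) * (2 * GFBITS - 1));
    for (int i = 0; i < GFBITS; i++) {
        for (int j = 0; j < GFBITS; j++) {
            out[i + j] = v_xor(out[i + j], v_and(a[i], b[j]));
        }
    }
    /* A full implementation would now fold out[GFBITS .. 2*GFBITS-2] back into
     * out[0 .. GFBITS-1] according to the GF(2^13) field polynomial; that is
     * what lines like "r15 ^= r24" in the generated assembly are doing. */
}

The assembly unrolls these loops completely and keeps the partial products in ymm registers, but the data flow is the same as in the sketch above.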
+# qhasm: r15 ^= r24 +# asm 1: vpxor r11=reg256#2 +# asm 2: vmovapd r11=%ymm1 +vmovapd %ymm1,%ymm1 + +# qhasm: a11 = mem256[ input_1 + 352 ] +# asm 1: vmovupd 352(a11=reg256#15 +# asm 2: vmovupd 352(a11=%ymm14 +vmovupd 352(%rsi),%ymm14 + +# qhasm: r = a11 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r21 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r22 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r23 ^= r +# asm 1: vpxor r10=reg256#14 +# asm 2: vmovapd r10=%ymm13 +vmovapd %ymm13,%ymm13 + +# qhasm: a10 = mem256[ input_1 + 320 ] +# asm 1: vmovupd 320(a10=reg256#15 +# asm 2: vmovupd 320(a10=%ymm14 +vmovupd 320(%rsi),%ymm14 + +# qhasm: r = a10 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r21 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r22 ^= r +# asm 1: vpxor r9=reg256#13 +# asm 2: vmovapd r9=%ymm12 +vmovapd %ymm12,%ymm12 + +# qhasm: a9 = mem256[ input_1 + 288 ] +# asm 1: vmovupd 288(a9=reg256#15 +# asm 2: vmovupd 288(a9=%ymm14 +vmovupd 288(%rsi),%ymm14 + +# qhasm: r = a9 & b0 +# 
asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r21 ^= r +# asm 1: vpxor r8=reg256#12 +# asm 2: vmovapd r8=%ymm11 +vmovapd %ymm11,%ymm11 + +# qhasm: a8 = mem256[ input_1 + 256 ] +# asm 1: vmovupd 256(a8=reg256#15 +# asm 2: vmovupd 256(a8=%ymm14 +vmovupd 256(%rsi),%ymm14 + +# qhasm: r = a8 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r20 ^= r +# asm 1: vpxor r7=reg256#11 +# asm 2: vmovapd r7=%ymm10 +vmovapd %ymm10,%ymm10 + +# qhasm: a7 = mem256[ input_1 + 224 ] +# asm 1: vmovupd 224(a7=reg256#15 +# asm 2: vmovupd 224(a7=%ymm14 +vmovupd 224(%rsi),%ymm14 + +# qhasm: r = a7 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 
64(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r19 ^= r +# asm 1: vpxor r6=reg256#10 +# asm 2: vmovapd r6=%ymm9 +vmovapd %ymm9,%ymm9 + +# qhasm: a6 = mem256[ input_1 + 192 ] +# asm 1: vmovupd 192(a6=reg256#15 +# asm 2: vmovupd 192(a6=%ymm14 +vmovupd 192(%rsi),%ymm14 + +# qhasm: r = a6 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r18 ^= r +# asm 1: vpxor r5=reg256#9 +# asm 2: vmovapd r5=%ymm8 +vmovapd %ymm8,%ymm8 + +# qhasm: a5 = mem256[ input_1 + 160 ] +# asm 1: vmovupd 160(a5=reg256#15 +# asm 2: vmovupd 160(a5=%ymm14 +vmovupd 160(%rsi),%ymm14 + +# qhasm: r = a5 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 
+# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r17 ^= r +# asm 1: vpxor r4=reg256#8 +# asm 2: vmovapd r4=%ymm7 +vmovapd %ymm7,%ymm7 + +# qhasm: a4 = mem256[ input_1 + 128 ] +# asm 1: vmovupd 128(a4=reg256#15 +# asm 2: vmovupd 128(a4=%ymm14 +vmovupd 128(%rsi),%ymm14 + +# qhasm: r = a4 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r16 ^= r +# asm 1: vpxor r3=reg256#7 +# asm 2: vmovapd r3=%ymm6 +vmovapd %ymm6,%ymm6 + +# qhasm: a3 = mem256[ input_1 + 96 ] +# asm 1: vmovupd 96(a3=reg256#15 +# asm 2: vmovupd 96(a3=%ymm14 +vmovupd 96(%rsi),%ymm14 + +# qhasm: r = a3 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= 
r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r15 ^= r +# asm 1: vpxor r2=reg256#6 +# asm 2: vmovapd r2=%ymm5 +vmovapd %ymm5,%ymm5 + +# qhasm: a2 = mem256[ input_1 + 64 ] +# asm 1: vmovupd 64(a2=reg256#15 +# asm 2: vmovupd 64(a2=%ymm14 +vmovupd 64(%rsi),%ymm14 + +# qhasm: r = a2 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r14 ^= r +# asm 1: vpxor r1=reg256#5 +# asm 2: vmovapd r1=%ymm4 +vmovapd %ymm4,%ymm4 + +# qhasm: a1 = mem256[ input_1 + 32 ] +# asm 1: vmovupd 32(a1=reg256#15 +# asm 2: vmovupd 32(a1=%ymm14 +vmovupd 32(%rsi),%ymm14 + +# qhasm: r = a1 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r1 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 
320(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r13 ^= r +# asm 1: vpxor r0=reg256#4 +# asm 2: vmovapd r0=%ymm3 +vmovapd %ymm3,%ymm3 + +# qhasm: a0 = mem256[ input_1 + 0 ] +# asm 1: vmovupd 0(a0=reg256#15 +# asm 2: vmovupd 0(a0=%ymm14 +vmovupd 0(%rsi),%ymm14 + +# qhasm: r = a0 & b0 +# asm 1: vpand r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm14,%ymm0,%ymm0 + +# qhasm: r0 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 32(r=%ymm0 +vpand 32(%rdx),%ymm14,%ymm0 + +# qhasm: r1 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 64(r=%ymm0 +vpand 64(%rdx),%ymm14,%ymm0 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 96(r=%ymm0 +vpand 96(%rdx),%ymm14,%ymm0 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 128(r=%ymm0 +vpand 128(%rdx),%ymm14,%ymm0 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 160(r=%ymm0 +vpand 160(%rdx),%ymm14,%ymm0 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 192(r=%ymm0 +vpand 192(%rdx),%ymm14,%ymm0 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 224(r=%ymm0 +vpand 224(%rdx),%ymm14,%ymm0 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 256(r=%ymm0 +vpand 256(%rdx),%ymm14,%ymm0 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 288(r=%ymm0 +vpand 288(%rdx),%ymm14,%ymm0 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 320(r=%ymm0 +vpand 320(%rdx),%ymm14,%ymm0 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 352(r=%ymm0 +vpand 352(%rdx),%ymm14,%ymm0 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 384(r=%ymm0 +vpand 384(%rdx),%ymm14,%ymm0 + +# qhasm: r12 ^= r +# asm 1: vpxor r12=reg256#1 +# asm 2: vpxor 384(r12=%ymm0 +vpxor 384(%rdi),%ymm2,%ymm0 + +# qhasm: mem256[ input_0 + 384 ] = r12 +# asm 1: vmovupd r12=reg256#1 +# asm 2: vpxor 384(r12=%ymm0 +vpxor 384(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 384 ] = r12 +# asm 1: vmovupd r11=reg256#1 +# asm 2: vpxor 352(r11=%ymm0 +vpxor 352(%rdi),%ymm1,%ymm0 + +# qhasm: mem256[ input_0 + 352 ] = r11 +# asm 1: vmovupd r11=reg256#1 +# asm 2: vpxor 352(r11=%ymm0 +vpxor 352(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 352 ] = r11 +# asm 1: vmovupd r10=reg256#1 +# asm 2: vpxor 320(r10=%ymm0 +vpxor 320(%rdi),%ymm13,%ymm0 + +# qhasm: mem256[ input_0 + 320 ] = r10 +# asm 1: vmovupd r10=reg256#1 +# asm 2: vpxor 320(r10=%ymm0 +vpxor 320(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 320 ] = r10 +# asm 1: vmovupd r9=reg256#1 +# asm 2: vpxor 288(r9=%ymm0 +vpxor 288(%rdi),%ymm12,%ymm0 + +# qhasm: mem256[ input_0 + 288 ] = r9 +# asm 1: vmovupd r9=reg256#1 +# asm 2: vpxor 288(r9=%ymm0 +vpxor 288(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 288 ] = r9 +# asm 1: vmovupd r8=reg256#1 +# asm 2: vpxor 256(r8=%ymm0 +vpxor 256(%rdi),%ymm11,%ymm0 + +# qhasm: mem256[ input_0 + 256 ] = r8 +# asm 1: vmovupd r8=reg256#1 +# asm 2: vpxor 256(r8=%ymm0 +vpxor 256(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 256 ] = r8 +# asm 1: vmovupd r7=reg256#1 +# asm 2: vpxor 224(r7=%ymm0 +vpxor 224(%rdi),%ymm10,%ymm0 + +# qhasm: mem256[ input_0 + 224 ] = r7 +# asm 1: vmovupd r7=reg256#1 +# asm 2: vpxor 224(r7=%ymm0 +vpxor 224(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 224 ] = r7 +# asm 1: vmovupd r6=reg256#1 +# asm 2: vpxor 192(r6=%ymm0 +vpxor 192(%rdi),%ymm9,%ymm0 + +# qhasm: mem256[ input_0 + 192 ] = r6 +# asm 1: 
vmovupd r6=reg256#1 +# asm 2: vpxor 192(r6=%ymm0 +vpxor 192(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 192 ] = r6 +# asm 1: vmovupd r5=reg256#1 +# asm 2: vpxor 160(r5=%ymm0 +vpxor 160(%rdi),%ymm8,%ymm0 + +# qhasm: mem256[ input_0 + 160 ] = r5 +# asm 1: vmovupd r5=reg256#1 +# asm 2: vpxor 160(r5=%ymm0 +vpxor 160(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 160 ] = r5 +# asm 1: vmovupd r4=reg256#1 +# asm 2: vpxor 128(r4=%ymm0 +vpxor 128(%rdi),%ymm7,%ymm0 + +# qhasm: mem256[ input_0 + 128 ] = r4 +# asm 1: vmovupd r4=reg256#1 +# asm 2: vpxor 128(r4=%ymm0 +vpxor 128(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 128 ] = r4 +# asm 1: vmovupd r3=reg256#1 +# asm 2: vpxor 96(r3=%ymm0 +vpxor 96(%rdi),%ymm6,%ymm0 + +# qhasm: mem256[ input_0 + 96 ] = r3 +# asm 1: vmovupd r3=reg256#1 +# asm 2: vpxor 96(r3=%ymm0 +vpxor 96(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 96 ] = r3 +# asm 1: vmovupd r2=reg256#1 +# asm 2: vpxor 64(r2=%ymm0 +vpxor 64(%rdi),%ymm5,%ymm0 + +# qhasm: mem256[ input_0 + 64 ] = r2 +# asm 1: vmovupd r2=reg256#1 +# asm 2: vpxor 64(r2=%ymm0 +vpxor 64(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 64 ] = r2 +# asm 1: vmovupd r1=reg256#1 +# asm 2: vpxor 32(r1=%ymm0 +vpxor 32(%rdi),%ymm4,%ymm0 + +# qhasm: mem256[ input_0 + 32 ] = r1 +# asm 1: vmovupd r1=reg256#1 +# asm 2: vpxor 32(r1=%ymm0 +vpxor 32(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 32 ] = r1 +# asm 1: vmovupd r0=reg256#1 +# asm 2: vpxor 0(r0=%ymm0 +vpxor 0(%rdi),%ymm3,%ymm0 + +# qhasm: mem256[ input_0 + 0 ] = r0 +# asm 1: vmovupd r0=reg256#1 +# asm 2: vpxor 0(r0=%ymm0 +vpxor 0(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 0 ] = r0 +# asm 1: vmovupd b0=reg256#1 +# asm 2: vmovupd 0(b0=%ymm0 +vmovupd 0(%rdx),%ymm0 + +# qhasm: a12 = mem256[ input_1 + 384 ] +# asm 1: vmovupd 384(a12=reg256#2 +# asm 2: vmovupd 384(a12=%ymm1 +vmovupd 384(%rsi),%ymm1 + +# qhasm: r12 = a12 & b0 +# asm 1: vpand r12=reg256#3 +# asm 2: vpand r12=%ymm2 +vpand %ymm1,%ymm0,%ymm2 + +# qhasm: r13 = a12 & mem256[input_2 + 32] +# asm 1: vpand 32(r13=reg256#4 +# asm 2: vpand 32(r13=%ymm3 +vpand 32(%rdx),%ymm1,%ymm3 + +# qhasm: r14 = a12 & mem256[input_2 + 64] +# asm 1: vpand 64(r14=reg256#5 +# asm 2: vpand 64(r14=%ymm4 +vpand 64(%rdx),%ymm1,%ymm4 + +# qhasm: r15 = a12 & mem256[input_2 + 96] +# asm 1: vpand 96(r15=reg256#6 +# asm 2: vpand 96(r15=%ymm5 +vpand 96(%rdx),%ymm1,%ymm5 + +# qhasm: r16 = a12 & mem256[input_2 + 128] +# asm 1: vpand 128(r16=reg256#7 +# asm 2: vpand 128(r16=%ymm6 +vpand 128(%rdx),%ymm1,%ymm6 + +# qhasm: r17 = a12 & mem256[input_2 + 160] +# asm 1: vpand 160(r17=reg256#8 +# asm 2: vpand 160(r17=%ymm7 +vpand 160(%rdx),%ymm1,%ymm7 + +# qhasm: r18 = a12 & mem256[input_2 + 192] +# asm 1: vpand 192(r18=reg256#9 +# asm 2: vpand 192(r18=%ymm8 +vpand 192(%rdx),%ymm1,%ymm8 + +# qhasm: r19 = a12 & mem256[input_2 + 224] +# asm 1: vpand 224(r19=reg256#10 +# asm 2: vpand 224(r19=%ymm9 +vpand 224(%rdx),%ymm1,%ymm9 + +# qhasm: r20 = a12 & mem256[input_2 + 256] +# asm 1: vpand 256(r20=reg256#11 +# asm 2: vpand 256(r20=%ymm10 +vpand 256(%rdx),%ymm1,%ymm10 + +# qhasm: r21 = a12 & mem256[input_2 + 288] +# asm 1: vpand 288(r21=reg256#12 +# asm 2: vpand 288(r21=%ymm11 +vpand 288(%rdx),%ymm1,%ymm11 + +# qhasm: r22 = a12 & mem256[input_2 + 320] +# asm 1: vpand 320(r22=reg256#13 +# asm 2: vpand 320(r22=%ymm12 +vpand 320(%rdx),%ymm1,%ymm12 + +# qhasm: r23 = a12 & mem256[input_2 + 352] +# asm 1: vpand 352(r23=reg256#14 +# asm 2: vpand 352(r23=%ymm13 +vpand 352(%rdx),%ymm1,%ymm13 + +# qhasm: r24 = a12 & mem256[input_2 + 384] +# asm 1: vpand 
384(r24=reg256#2 +# asm 2: vpand 384(r24=%ymm1 +vpand 384(%rdx),%ymm1,%ymm1 + +# qhasm: r15 ^= r24 +# asm 1: vpxor r11=reg256#2 +# asm 2: vmovapd r11=%ymm1 +vmovapd %ymm1,%ymm1 + +# qhasm: a11 = mem256[ input_1 + 352 ] +# asm 1: vmovupd 352(a11=reg256#15 +# asm 2: vmovupd 352(a11=%ymm14 +vmovupd 352(%rsi),%ymm14 + +# qhasm: r = a11 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r21 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r22 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r23 ^= r +# asm 1: vpxor r10=reg256#14 +# asm 2: vmovapd r10=%ymm13 +vmovapd %ymm13,%ymm13 + +# qhasm: a10 = mem256[ input_1 + 320 ] +# asm 1: vmovupd 320(a10=reg256#15 +# asm 2: vmovupd 320(a10=%ymm14 +vmovupd 320(%rsi),%ymm14 + +# qhasm: r = a10 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r21 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r22 ^= r +# asm 1: vpxor r9=reg256#13 +# asm 2: vmovapd r9=%ymm12 +vmovapd %ymm12,%ymm12 + +# qhasm: a9 = mem256[ input_1 + 288 ] +# asm 1: vmovupd 288(a9=reg256#15 +# asm 2: 
vmovupd 288(a9=%ymm14 +vmovupd 288(%rsi),%ymm14 + +# qhasm: r = a9 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r21 ^= r +# asm 1: vpxor r8=reg256#12 +# asm 2: vmovapd r8=%ymm11 +vmovapd %ymm11,%ymm11 + +# qhasm: a8 = mem256[ input_1 + 256 ] +# asm 1: vmovupd 256(a8=reg256#15 +# asm 2: vmovupd 256(a8=%ymm14 +vmovupd 256(%rsi),%ymm14 + +# qhasm: r = a8 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r20 ^= r +# asm 1: vpxor r7=reg256#11 +# asm 2: vmovapd r7=%ymm10 +vmovapd %ymm10,%ymm10 + +# qhasm: a7 = mem256[ input_1 + 224 ] +# asm 1: vmovupd 224(a7=reg256#15 +# asm 2: vmovupd 224(a7=%ymm14 +vmovupd 224(%rsi),%ymm14 + +# qhasm: r = a7 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= 
r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r19 ^= r +# asm 1: vpxor r6=reg256#10 +# asm 2: vmovapd r6=%ymm9 +vmovapd %ymm9,%ymm9 + +# qhasm: a6 = mem256[ input_1 + 192 ] +# asm 1: vmovupd 192(a6=reg256#15 +# asm 2: vmovupd 192(a6=%ymm14 +vmovupd 192(%rsi),%ymm14 + +# qhasm: r = a6 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r18 ^= r +# asm 1: vpxor r5=reg256#9 +# asm 2: vmovapd r5=%ymm8 +vmovapd %ymm8,%ymm8 + +# qhasm: a5 = mem256[ input_1 + 160 ] +# asm 1: vmovupd 160(a5=reg256#15 +# asm 2: vmovupd 160(a5=%ymm14 +vmovupd 160(%rsi),%ymm14 + +# qhasm: r = a5 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 
128(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r17 ^= r +# asm 1: vpxor r4=reg256#8 +# asm 2: vmovapd r4=%ymm7 +vmovapd %ymm7,%ymm7 + +# qhasm: a4 = mem256[ input_1 + 128 ] +# asm 1: vmovupd 128(a4=reg256#15 +# asm 2: vmovupd 128(a4=%ymm14 +vmovupd 128(%rsi),%ymm14 + +# qhasm: r = a4 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r16 ^= r +# asm 1: vpxor r3=reg256#7 +# asm 2: vmovapd r3=%ymm6 +vmovapd %ymm6,%ymm6 + +# qhasm: a3 = mem256[ input_1 + 96 ] +# asm 1: vmovupd 96(a3=reg256#15 +# asm 2: vmovupd 96(a3=%ymm14 +vmovupd 96(%rsi),%ymm14 + +# qhasm: r = a3 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 
2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r15 ^= r +# asm 1: vpxor r2=reg256#6 +# asm 2: vmovapd r2=%ymm5 +vmovapd %ymm5,%ymm5 + +# qhasm: a2 = mem256[ input_1 + 64 ] +# asm 1: vmovupd 64(a2=reg256#15 +# asm 2: vmovupd 64(a2=%ymm14 +vmovupd 64(%rsi),%ymm14 + +# qhasm: r = a2 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r14 ^= r +# asm 1: vpxor r1=reg256#5 +# asm 2: vmovapd r1=%ymm4 +vmovapd %ymm4,%ymm4 + +# qhasm: a1 = mem256[ input_1 + 32 ] +# asm 1: vmovupd 32(a1=reg256#15 +# asm 2: vmovupd 32(a1=%ymm14 +vmovupd 32(%rsi),%ymm14 + +# qhasm: r = a1 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r1 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: 
vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r13 ^= r +# asm 1: vpxor r0=reg256#4 +# asm 2: vmovapd r0=%ymm3 +vmovapd %ymm3,%ymm3 + +# qhasm: a0 = mem256[ input_1 + 0 ] +# asm 1: vmovupd 0(a0=reg256#15 +# asm 2: vmovupd 0(a0=%ymm14 +vmovupd 0(%rsi),%ymm14 + +# qhasm: r = a0 & b0 +# asm 1: vpand r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm14,%ymm0,%ymm0 + +# qhasm: r0 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 32(r=%ymm0 +vpand 32(%rdx),%ymm14,%ymm0 + +# qhasm: r1 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 64(r=%ymm0 +vpand 64(%rdx),%ymm14,%ymm0 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 96(r=%ymm0 +vpand 96(%rdx),%ymm14,%ymm0 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 128(r=%ymm0 +vpand 128(%rdx),%ymm14,%ymm0 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 160(r=%ymm0 +vpand 160(%rdx),%ymm14,%ymm0 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 192(r=%ymm0 +vpand 192(%rdx),%ymm14,%ymm0 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 224(r=%ymm0 +vpand 224(%rdx),%ymm14,%ymm0 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 256(r=%ymm0 +vpand 256(%rdx),%ymm14,%ymm0 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 288(r=%ymm0 +vpand 288(%rdx),%ymm14,%ymm0 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 320(r=%ymm0 +vpand 320(%rdx),%ymm14,%ymm0 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 352(r=%ymm0 +vpand 352(%rdx),%ymm14,%ymm0 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 384(r=%ymm0 +vpand 384(%rdx),%ymm14,%ymm0 + +# qhasm: r12 ^= r +# asm 1: vpxor r=int64#7 +# asm 2: mov $0,>r=%rax +mov $0,%rax + +# qhasm: t0 = mem64[ input_0 + 192 ] +# asm 1: movq 192(t0=int64#2 +# asm 2: movq 192(t0=%rsi +movq 192(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 200 ] +# asm 1: movq 200(t1=int64#3 +# asm 2: movq 200(t1=%rdx +movq 200(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 176(t0=%rsi +movq 176(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 184 ] +# asm 1: movq 184(t1=int64#3 +# asm 2: movq 184(t1=%rdx +movq 184(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 160(t0=%rsi +movq 160(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 168 ] +# asm 1: movq 168(t1=int64#3 +# asm 2: movq 168(t1=%rdx +movq 168(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 144(t0=%rsi +movq 144(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 152 ] +# asm 1: movq 152(t1=int64#3 +# asm 2: movq 152(t1=%rdx +movq 152(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 128(t0=%rsi +movq 128(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 136 ] +# asm 1: movq 136(t1=int64#3 +# asm 2: movq 136(t1=%rdx +movq 136(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 
+# asm 1: and $1,t0=int64#2 +# asm 2: movq 112(t0=%rsi +movq 112(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 120 ] +# asm 1: movq 120(t1=int64#3 +# asm 2: movq 120(t1=%rdx +movq 120(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 96(t0=%rsi +movq 96(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 104 ] +# asm 1: movq 104(t1=int64#3 +# asm 2: movq 104(t1=%rdx +movq 104(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 80(t0=%rsi +movq 80(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 88 ] +# asm 1: movq 88(t1=int64#3 +# asm 2: movq 88(t1=%rdx +movq 88(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 64(t0=%rsi +movq 64(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 72 ] +# asm 1: movq 72(t1=int64#3 +# asm 2: movq 72(t1=%rdx +movq 72(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 48(t0=%rsi +movq 48(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 56 ] +# asm 1: movq 56(t1=int64#3 +# asm 2: movq 56(t1=%rdx +movq 56(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 32(t0=%rsi +movq 32(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 40 ] +# asm 1: movq 40(t1=int64#3 +# asm 2: movq 40(t1=%rdx +movq 40(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 16(t0=%rsi +movq 16(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 24 ] +# asm 1: movq 24(t1=int64#3 +# asm 2: movq 24(t1=%rdx +movq 24(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 0(t0=%rsi +movq 0(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(t1=int64#1 +# asm 2: movq 8(t1=%rdi +movq 8(%rdi),%rdi + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#1 +# asm 2: popcnt c=%rdi +popcnt %rsi, %rdi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1, +#include + +#include "aes.h" + + +void PQCLEAN_MCELIECE6960119_AVX_aes256ctr( + uint8_t *out, + size_t outlen, + const uint8_t nonce[AESCTR_NONCEBYTES], + const uint8_t key[AES256_KEYBYTES] +); + +#endif diff --git a/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/api.h b/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/api.h new file mode 100644 index 0000000000..ddae8a93ed --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/api.h @@ -0,0 +1,32 @@ +#ifndef PQCLEAN_MCELIECE6960119_AVX_API_H +#define PQCLEAN_MCELIECE6960119_AVX_API_H + +#include + +#define PQCLEAN_MCELIECE6960119_AVX_CRYPTO_ALGNAME "Classic McEliece 6960119" +#define PQCLEAN_MCELIECE6960119_AVX_CRYPTO_PUBLICKEYBYTES 1047319 +#define PQCLEAN_MCELIECE6960119_AVX_CRYPTO_SECRETKEYBYTES 13908 +#define PQCLEAN_MCELIECE6960119_AVX_CRYPTO_CIPHERTEXTBYTES 226 +#define PQCLEAN_MCELIECE6960119_AVX_CRYPTO_BYTES 32 + + +int PQCLEAN_MCELIECE6960119_AVX_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const uint8_t *pk +); + +int PQCLEAN_MCELIECE6960119_AVX_crypto_kem_dec( + uint8_t *key, + const 
uint8_t *c, + const uint8_t *sk +); + +int PQCLEAN_MCELIECE6960119_AVX_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/benes.c b/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/benes.c new file mode 100644 index 0000000000..4d859b1087 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/benes.c @@ -0,0 +1,311 @@ +/* + This file is for Benes network related functions +*/ + +#include "benes.h" + +#include "params.h" +#include "transpose.h" +#include "util.h" + +static void layer_x(vec128 *data, vec128 *bits) { + int i; + vec128 v0, v1; + vec128 d; + + for (i = 0; i < 64; i += 2) { + v0 = PQCLEAN_MCELIECE6960119_AVX_vec128_unpack_low(data[i + 0], data[i + 1]); + v1 = PQCLEAN_MCELIECE6960119_AVX_vec128_unpack_high(data[i + 0], data[i + 1]); + + d = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(v0, v1); + d = PQCLEAN_MCELIECE6960119_AVX_vec128_and(d, *bits++); + v0 = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(v0, d); + v1 = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(v1, d); + + data[i + 0] = PQCLEAN_MCELIECE6960119_AVX_vec128_unpack_low(v0, v1); + data[i + 1] = PQCLEAN_MCELIECE6960119_AVX_vec128_unpack_high(v0, v1); + } +} + +static void layer_0(vec128 *bs, vec128 *cond) { + int x; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 2) { + diff = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ x ], bs[ x + 1 ]); + diff = PQCLEAN_MCELIECE6960119_AVX_vec128_and(diff, *cond++); + bs[ x ] = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ x ], diff); + bs[ x + 1 ] = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ x + 1 ], diff); + } +} + +static void layer_1(vec128 *bs, vec128 *cond) { + int x; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 4) { + diff = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ x + 0 ], bs[ x + 2 ]); + diff = PQCLEAN_MCELIECE6960119_AVX_vec128_and(diff, cond[0]); + bs[ x + 0 ] = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ x + 0 ], diff); + bs[ x + 2 ] = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ x + 2 ], diff); + + diff = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ x + 1 ], bs[ x + 3 ]); + diff = PQCLEAN_MCELIECE6960119_AVX_vec128_and(diff, cond[1]); + bs[ x + 1 ] = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ x + 1 ], diff); + bs[ x + 3 ] = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ x + 3 ], diff); + + cond += 2; + } +} + +static void layer_2(vec128 *bs, vec128 *cond) { + int x; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 8) { + diff = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ x + 0 ], bs[ x + 4 ]); + diff = PQCLEAN_MCELIECE6960119_AVX_vec128_and(diff, cond[0]); + bs[ x + 0 ] = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ x + 0 ], diff); + bs[ x + 4 ] = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ x + 4 ], diff); + + diff = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ x + 1 ], bs[ x + 5 ]); + diff = PQCLEAN_MCELIECE6960119_AVX_vec128_and(diff, cond[1]); + bs[ x + 1 ] = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ x + 1 ], diff); + bs[ x + 5 ] = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ x + 5 ], diff); + + diff = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ x + 2 ], bs[ x + 6 ]); + diff = PQCLEAN_MCELIECE6960119_AVX_vec128_and(diff, cond[2]); + bs[ x + 2 ] = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ x + 2 ], diff); + bs[ x + 6 ] = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ x + 6 ], diff); + + diff = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ x + 3 ], bs[ x + 7 ]); + diff = PQCLEAN_MCELIECE6960119_AVX_vec128_and(diff, cond[3]); + bs[ x + 3 ] = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ x + 3 ], diff); + bs[ 
x + 7 ] = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ x + 7 ], diff); + + cond += 4; + } +} + +static void layer_3(vec128 *bs, vec128 *cond) { + int x, s; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 16) { + for (s = x; s < x + 8; s += 4) { + diff = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ s + 0 ], bs[ s + 8 ]); + diff = PQCLEAN_MCELIECE6960119_AVX_vec128_and(diff, cond[0]); + bs[ s + 0 ] = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ s + 0 ], diff); + bs[ s + 8 ] = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ s + 8 ], diff); + + diff = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ s + 1 ], bs[ s + 9 ]); + diff = PQCLEAN_MCELIECE6960119_AVX_vec128_and(diff, cond[1]); + bs[ s + 1 ] = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ s + 1 ], diff); + bs[ s + 9 ] = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ s + 9 ], diff); + + diff = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ s + 2 ], bs[ s + 10 ]); + diff = PQCLEAN_MCELIECE6960119_AVX_vec128_and(diff, cond[2]); + bs[ s + 2 ] = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ s + 2 ], diff); + bs[ s + 10 ] = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ s + 10 ], diff); + + diff = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ s + 3 ], bs[ s + 11 ]); + diff = PQCLEAN_MCELIECE6960119_AVX_vec128_and(diff, cond[3]); + bs[ s + 3 ] = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ s + 3 ], diff); + bs[ s + 11 ] = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ s + 11 ], diff); + + cond += 4; + } + } +} + +static void layer_4(vec128 *bs, vec128 *cond) { + int x, s; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 32) { + for (s = x; s < x + 16; s += 4) { + diff = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ s + 0 ], bs[ s + 16 ]); + diff = PQCLEAN_MCELIECE6960119_AVX_vec128_and(diff, cond[0]); + bs[ s + 0 ] = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ s + 0 ], diff); + bs[ s + 16 ] = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ s + 16 ], diff); + + diff = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ s + 1 ], bs[ s + 17 ]); + diff = PQCLEAN_MCELIECE6960119_AVX_vec128_and(diff, cond[1]); + bs[ s + 1 ] = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ s + 1 ], diff); + bs[ s + 17 ] = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ s + 17 ], diff); + + diff = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ s + 2 ], bs[ s + 18 ]); + diff = PQCLEAN_MCELIECE6960119_AVX_vec128_and(diff, cond[2]); + bs[ s + 2 ] = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ s + 2 ], diff); + bs[ s + 18 ] = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ s + 18 ], diff); + + diff = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ s + 3 ], bs[ s + 19 ]); + diff = PQCLEAN_MCELIECE6960119_AVX_vec128_and(diff, cond[3]); + bs[ s + 3 ] = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ s + 3 ], diff); + bs[ s + 19 ] = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ s + 19 ], diff); + + cond += 4; + } + } +} + +static void layer_5(vec128 *bs, vec128 *cond) { + int x, s; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 64) { + for (s = x; s < x + 32; s += 4) { + diff = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ s + 0 ], bs[ s + 32 ]); + diff = PQCLEAN_MCELIECE6960119_AVX_vec128_and(diff, cond[0]); + bs[ s + 0 ] = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ s + 0 ], diff); + bs[ s + 32 ] = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ s + 32 ], diff); + + diff = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ s + 1 ], bs[ s + 33 ]); + diff = PQCLEAN_MCELIECE6960119_AVX_vec128_and(diff, cond[1]); + bs[ s + 1 ] = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ s + 1 ], diff); + bs[ s + 33 ] = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ s + 33 ], diff); + + diff = 
PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ s + 2 ], bs[ s + 34 ]); + diff = PQCLEAN_MCELIECE6960119_AVX_vec128_and(diff, cond[2]); + bs[ s + 2 ] = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ s + 2 ], diff); + bs[ s + 34 ] = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ s + 34 ], diff); + + diff = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ s + 3 ], bs[ s + 35 ]); + diff = PQCLEAN_MCELIECE6960119_AVX_vec128_and(diff, cond[3]); + bs[ s + 3 ] = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ s + 3 ], diff); + bs[ s + 35 ] = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(bs[ s + 35 ], diff); + + cond += 4; + } + } +} + +/* input: bits, control bits as array of bytes */ +/* output: bits_int, control bits as array of 128-bit vectors */ +void PQCLEAN_MCELIECE6960119_AVX_load_bits(vec128 bits_int[][32], const unsigned char *bits) { + int i, j; + const unsigned char *ptr = bits; + + vec128 buf[64]; + + for (i = 0; i <= 5; i += 2) { + for (j = 0; j < 64; j++) { + buf[j] = PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(PQCLEAN_MCELIECE6960119_AVX_load8(ptr), PQCLEAN_MCELIECE6960119_AVX_load8(ptr + 512)); + ptr += 8; + } + + PQCLEAN_MCELIECE6960119_AVX_transpose_64x128_sp( buf ); + + for (j = 0; j < 32; j++) { + bits_int[i + 0][j] = PQCLEAN_MCELIECE6960119_AVX_vec128_unpack_low(buf[j], buf[j + 32]); + bits_int[i + 1][j] = PQCLEAN_MCELIECE6960119_AVX_vec128_unpack_high(buf[j], buf[j + 32]); + } + + ptr += 512; + } + + for (i = 6; i <= 18; i++) { + for (j = 0; j < 32; j++) { + bits_int[i][j] = PQCLEAN_MCELIECE6960119_AVX_load16(ptr); + ptr += 16; + } + } + + for (i = 19; i < 25; i += 2) { + for (j = 0; j < 64; j++) { + buf[j] = PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(PQCLEAN_MCELIECE6960119_AVX_load8(ptr), PQCLEAN_MCELIECE6960119_AVX_load8(ptr + 512)); + ptr += 8; + } + + PQCLEAN_MCELIECE6960119_AVX_transpose_64x128_sp( buf ); + + for (j = 0; j < 32; j++) { + bits_int[i + 0][j] = PQCLEAN_MCELIECE6960119_AVX_vec128_unpack_low(buf[j], buf[j + 32]); + bits_int[i + 1][j] = PQCLEAN_MCELIECE6960119_AVX_vec128_unpack_high(buf[j], buf[j + 32]); + } + + ptr += 512; + } +} + +/* input: r, sequence of bits to be permuted */ +/* b, control bits as array of 128-bit vectors */ +/* rev, 0 for normal application; !0 for inverse */ +/* output: r, permuted bits */ +void PQCLEAN_MCELIECE6960119_AVX_benes(vec128 *r, vec128 b[][32], int rev) { + int inc; + + vec128 *b_ptr; + + if (rev == 0) { + inc = 32; + b_ptr = b[ 0]; + } else { + inc = -32; + b_ptr = b[24]; + } + + // + + PQCLEAN_MCELIECE6960119_AVX_transpose_64x128_sp( r ); + + layer_0(r, b_ptr); + b_ptr += inc; + layer_1(r, b_ptr); + b_ptr += inc; + layer_2(r, b_ptr); + b_ptr += inc; + layer_3(r, b_ptr); + b_ptr += inc; + layer_4(r, b_ptr); + b_ptr += inc; + layer_5(r, b_ptr); + b_ptr += inc; + + PQCLEAN_MCELIECE6960119_AVX_transpose_64x128_sp( r ); + + layer_x(r, b_ptr); + b_ptr += inc; + layer_0(r, b_ptr); + b_ptr += inc; + layer_1(r, b_ptr); + b_ptr += inc; + layer_2(r, b_ptr); + b_ptr += inc; + layer_3(r, b_ptr); + b_ptr += inc; + layer_4(r, b_ptr); + b_ptr += inc; + layer_5(r, b_ptr); + b_ptr += inc; + layer_4(r, b_ptr); + b_ptr += inc; + layer_3(r, b_ptr); + b_ptr += inc; + layer_2(r, b_ptr); + b_ptr += inc; + layer_1(r, b_ptr); + b_ptr += inc; + layer_0(r, b_ptr); + b_ptr += inc; + layer_x(r, b_ptr); + b_ptr += inc; + + PQCLEAN_MCELIECE6960119_AVX_transpose_64x128_sp( r ); + + layer_5(r, b_ptr); + b_ptr += inc; + layer_4(r, b_ptr); + b_ptr += inc; + layer_3(r, b_ptr); + b_ptr += inc; + layer_2(r, b_ptr); + b_ptr += inc; + layer_1(r, b_ptr); + b_ptr += inc; + layer_0(r, b_ptr); + 
//b_ptr += inc;
+
+    PQCLEAN_MCELIECE6960119_AVX_transpose_64x128_sp( r );
+}
+
diff --git a/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/benes.h b/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/benes.h
new file mode 100644
index 0000000000..0120309c02
--- /dev/null
+++ b/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/benes.h
@@ -0,0 +1,14 @@
+#ifndef PQCLEAN_MCELIECE6960119_AVX_BENES_H
+#define PQCLEAN_MCELIECE6960119_AVX_BENES_H
+/*
+  This file is for Benes network related functions
+*/
+
+
+#include "vec128.h"
+
+void PQCLEAN_MCELIECE6960119_AVX_load_bits(vec128 /*bits_int*/[][32], const unsigned char * /*bits*/);
+void PQCLEAN_MCELIECE6960119_AVX_benes(vec128 * /*r*/, vec128 /*b*/[][32], int /*rev*/);
+
+#endif
+
diff --git a/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/bm.c b/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/bm.c
new file mode 100644
index 0000000000..ea7e6bd081
--- /dev/null
+++ b/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/bm.c
@@ -0,0 +1,215 @@
+/*
+  This file is for the inversion-free Berlekamp-Massey algorithm
+  see https://ieeexplore.ieee.org/document/87857
+*/
+
+#include "bm.h"
+
+#include "gf.h"
+#include "params.h"
+#include "vec128.h"
+
+#include <stdint.h>
+
+extern gf PQCLEAN_MCELIECE6960119_AVX_vec_reduce_asm(vec128 *);
+extern void PQCLEAN_MCELIECE6960119_AVX_update_asm(void *, gf, int);
+
+static inline uint16_t mask_nonzero(gf a) {
+    uint32_t ret = a;
+
+    ret -= 1;
+    ret >>= 31;
+    ret -= 1;
+
+    return ret;
+}
+
+static inline uint16_t mask_leq(uint16_t a, uint16_t b) {
+    uint32_t a_tmp = a;
+    uint32_t b_tmp = b;
+    uint32_t ret = b_tmp - a_tmp;
+
+    ret >>= 31;
+    ret -= 1;
+
+    return ret;
+}
+
+static inline void vec128_cmov(vec128 out[][2], uint16_t mask) {
+    int i;
+
+    vec128 v0, v1;
+
+    vec128 m0 = PQCLEAN_MCELIECE6960119_AVX_vec128_set1_16b( mask);
+    vec128 m1 = PQCLEAN_MCELIECE6960119_AVX_vec128_set1_16b(~mask);
+
+    for (i = 0; i < GFBITS; i++) {
+        v0 = PQCLEAN_MCELIECE6960119_AVX_vec128_and(out[i][1], m0);
+        v1 = PQCLEAN_MCELIECE6960119_AVX_vec128_and(out[i][0], m1);
+        out[i][0] = PQCLEAN_MCELIECE6960119_AVX_vec128_or(v0, v1);
+    }
+}
+
+static inline void interleave(vec256 *in, int idx0, int idx1, vec256 *mask, int b) {
+    int s = 1 << b;
+
+    vec256 x, y;
+
+    x = PQCLEAN_MCELIECE6960119_AVX_vec256_or(PQCLEAN_MCELIECE6960119_AVX_vec256_and(in[idx0], mask[0]),
+                                              PQCLEAN_MCELIECE6960119_AVX_vec256_sll_4x(PQCLEAN_MCELIECE6960119_AVX_vec256_and(in[idx1], mask[0]), s));
+
+    y = PQCLEAN_MCELIECE6960119_AVX_vec256_or(PQCLEAN_MCELIECE6960119_AVX_vec256_srl_4x(PQCLEAN_MCELIECE6960119_AVX_vec256_and(in[idx0], mask[1]), s),
+                                              PQCLEAN_MCELIECE6960119_AVX_vec256_and(in[idx1], mask[1]));
+
+    in[idx0] = x;
+    in[idx1] = y;
+}
+
+/* input: in, field elements in bitsliced form */
+/* output: out, field elements in non-bitsliced form */
+static inline void get_coefs(gf *out, vec256 *in) {
+    int i, k;
+
+    vec256 mask[4][2];
+    vec256 buf[16];
+
+    for (i = 0; i < 13; i++) {
+        buf[i] = in[i];
+    }
+    for (i = 13; i < 16; i++) {
+        buf[i] = PQCLEAN_MCELIECE6960119_AVX_vec256_setzero();
+    }
+
+    mask[0][0] = PQCLEAN_MCELIECE6960119_AVX_vec256_set1_16b(0x5555);
+    mask[0][1] = PQCLEAN_MCELIECE6960119_AVX_vec256_set1_16b(0xAAAA);
+    mask[1][0] = PQCLEAN_MCELIECE6960119_AVX_vec256_set1_16b(0x3333);
+    mask[1][1] = PQCLEAN_MCELIECE6960119_AVX_vec256_set1_16b(0xCCCC);
+    mask[2][0] = PQCLEAN_MCELIECE6960119_AVX_vec256_set1_16b(0x0F0F);
+    mask[2][1] = PQCLEAN_MCELIECE6960119_AVX_vec256_set1_16b(0xF0F0);
+    mask[3][0] =
PQCLEAN_MCELIECE6960119_AVX_vec256_set1_16b(0x00FF); + mask[3][1] = PQCLEAN_MCELIECE6960119_AVX_vec256_set1_16b(0xFF00); + + interleave(buf, 0, 8, mask[3], 3); + interleave(buf, 1, 9, mask[3], 3); + interleave(buf, 2, 10, mask[3], 3); + interleave(buf, 3, 11, mask[3], 3); + interleave(buf, 4, 12, mask[3], 3); + interleave(buf, 5, 13, mask[3], 3); + interleave(buf, 6, 14, mask[3], 3); + interleave(buf, 7, 15, mask[3], 3); + + interleave(buf, 0, 4, mask[2], 2); + interleave(buf, 1, 5, mask[2], 2); + interleave(buf, 2, 6, mask[2], 2); + interleave(buf, 3, 7, mask[2], 2); + interleave(buf, 8, 12, mask[2], 2); + interleave(buf, 9, 13, mask[2], 2); + interleave(buf, 10, 14, mask[2], 2); + interleave(buf, 11, 15, mask[2], 2); + + interleave(buf, 0, 2, mask[1], 1); + interleave(buf, 1, 3, mask[1], 1); + interleave(buf, 4, 6, mask[1], 1); + interleave(buf, 5, 7, mask[1], 1); + interleave(buf, 8, 10, mask[1], 1); + interleave(buf, 9, 11, mask[1], 1); + interleave(buf, 12, 14, mask[1], 1); + interleave(buf, 13, 15, mask[1], 1); + + interleave(buf, 0, 1, mask[0], 0); + interleave(buf, 2, 3, mask[0], 0); + interleave(buf, 4, 5, mask[0], 0); + interleave(buf, 6, 7, mask[0], 0); + interleave(buf, 8, 9, mask[0], 0); + interleave(buf, 10, 11, mask[0], 0); + interleave(buf, 12, 13, mask[0], 0); + interleave(buf, 14, 15, mask[0], 0); + + for (i = 0; i < 16; i++) { + for (k = 0; k < 4; k++) { + out[ (4 * 0 + k) * 16 + i ] = (PQCLEAN_MCELIECE6960119_AVX_vec256_extract(buf[i], 0) >> (k * 16)) & GFMASK; + out[ (4 * 1 + k) * 16 + i ] = (PQCLEAN_MCELIECE6960119_AVX_vec256_extract(buf[i], 1) >> (k * 16)) & GFMASK; + out[ (4 * 2 + k) * 16 + i ] = (PQCLEAN_MCELIECE6960119_AVX_vec256_extract(buf[i], 2) >> (k * 16)) & GFMASK; + out[ (4 * 3 + k) * 16 + i ] = (PQCLEAN_MCELIECE6960119_AVX_vec256_extract(buf[i], 3) >> (k * 16)) & GFMASK; + } + } +} + +typedef union { + vec128 as_128[GFBITS][2]; + vec256 as_256[GFBITS]; +} aligned_double_vec128; + +/* input: in, sequence of field elements */ +/* output: out, minimal polynomial of in */ +void PQCLEAN_MCELIECE6960119_AVX_bm(vec128 *out, vec256 *in) { + int i; + uint16_t N, L; + uint16_t mask; + uint64_t one = 1; + uint64_t v[2]; + + vec128 prod[ GFBITS ]; + vec128 interval[GFBITS]; + + aligned_double_vec128 db; + aligned_double_vec128 BC_tmp; + aligned_double_vec128 BC; + + gf d, b; + gf coefs[256]; + + // initialization + + get_coefs(coefs, in); + + BC.as_128[0][0] = PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0, one << 62); + BC.as_128[0][1] = PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0, one << 63); + + for (i = 1; i < GFBITS; i++) { + BC.as_128[i][0] = BC.as_128[i][1] = PQCLEAN_MCELIECE6960119_AVX_vec128_setzero(); + } + + b = 1; + L = 0; + + // + + for (i = 0; i < GFBITS; i++) { + interval[i] = PQCLEAN_MCELIECE6960119_AVX_vec128_setzero(); + } + + for (N = 0; N < SYS_T * 2; N++) { + PQCLEAN_MCELIECE6960119_AVX_update_asm(interval, coefs[N], 16); + PQCLEAN_MCELIECE6960119_AVX_vec128_mul_asm(prod, interval, BC.as_128[0] + 1, 32); + + d = PQCLEAN_MCELIECE6960119_AVX_vec_reduce_asm(prod); + + mask = mask_nonzero(d) & mask_leq(L * 2, N); + + for (i = 0; i < GFBITS; i++) { + db.as_128[i][0] = PQCLEAN_MCELIECE6960119_AVX_vec128_setbits((d >> i) & 1); + db.as_128[i][1] = PQCLEAN_MCELIECE6960119_AVX_vec128_setbits((b >> i) & 1); + } + + PQCLEAN_MCELIECE6960119_AVX_vec256_mul(BC_tmp.as_256, db.as_256, BC.as_256); + + vec128_cmov(BC.as_128, mask); + PQCLEAN_MCELIECE6960119_AVX_update_asm(BC.as_128, 0, 32); + + for (i = 0; i < GFBITS; i++) { + BC.as_128[i][1] = 
PQCLEAN_MCELIECE6960119_AVX_vec128_xor(BC_tmp.as_128[i][0], BC_tmp.as_128[i][1]); + } + + b = (d & mask) | (b & ~mask); + L = ((N + 1 - L) & mask) | (L & ~mask); + } + + for (i = 0; i < GFBITS; i++) { + v[0] = PQCLEAN_MCELIECE6960119_AVX_vec128_extract(BC.as_128[i][1], 0); + v[1] = PQCLEAN_MCELIECE6960119_AVX_vec128_extract(BC.as_128[i][1], 1); + + out[i] = PQCLEAN_MCELIECE6960119_AVX_vec128_set2x((v[0] >> 8) | (v[1] << 56), v[1] >> 8); + } +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/bm.h b/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/bm.h new file mode 100644 index 0000000000..b320500869 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/bm.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE6960119_AVX_BM_H +#define PQCLEAN_MCELIECE6960119_AVX_BM_H +/* + This file is for the inversion-free Berlekamp-Massey algorithm + see https://ieeexplore.ieee.org/document/87857 +*/ + +#include "vec128.h" +#include "vec256.h" + +void PQCLEAN_MCELIECE6960119_AVX_bm(vec128 * /*out*/, vec256 * /*in*/); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/consts.S b/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/consts.S new file mode 100644 index 0000000000..e4abaf6185 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/consts.S @@ -0,0 +1,33 @@ +.data + +# not supported on MacOS +#.section .rodata + +.globl PQCLEAN_MCELIECE6960119_AVX_MASK0_0 +.globl PQCLEAN_MCELIECE6960119_AVX_MASK0_1 +.globl PQCLEAN_MCELIECE6960119_AVX_MASK1_0 +.globl PQCLEAN_MCELIECE6960119_AVX_MASK1_1 +.globl PQCLEAN_MCELIECE6960119_AVX_MASK2_0 +.globl PQCLEAN_MCELIECE6960119_AVX_MASK2_1 +.globl PQCLEAN_MCELIECE6960119_AVX_MASK3_0 +.globl PQCLEAN_MCELIECE6960119_AVX_MASK3_1 +.globl PQCLEAN_MCELIECE6960119_AVX_MASK4_0 +.globl PQCLEAN_MCELIECE6960119_AVX_MASK4_1 +.globl PQCLEAN_MCELIECE6960119_AVX_MASK5_0 +.globl PQCLEAN_MCELIECE6960119_AVX_MASK5_1 + +.p2align 5 + +PQCLEAN_MCELIECE6960119_AVX_MASK0_0: .quad 0x5555555555555555, 0x5555555555555555, 0x5555555555555555, 0x5555555555555555 +PQCLEAN_MCELIECE6960119_AVX_MASK0_1: .quad 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA +PQCLEAN_MCELIECE6960119_AVX_MASK1_0: .quad 0x3333333333333333, 0x3333333333333333, 0x3333333333333333, 0x3333333333333333 +PQCLEAN_MCELIECE6960119_AVX_MASK1_1: .quad 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC +PQCLEAN_MCELIECE6960119_AVX_MASK2_0: .quad 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F +PQCLEAN_MCELIECE6960119_AVX_MASK2_1: .quad 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0 +PQCLEAN_MCELIECE6960119_AVX_MASK3_0: .quad 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF +PQCLEAN_MCELIECE6960119_AVX_MASK3_1: .quad 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00 +PQCLEAN_MCELIECE6960119_AVX_MASK4_0: .quad 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF +PQCLEAN_MCELIECE6960119_AVX_MASK4_1: .quad 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000 +PQCLEAN_MCELIECE6960119_AVX_MASK5_0: .quad 0x00000000FFFFFFFF, 0x00000000FFFFFFFF, 0x00000000FFFFFFFF, 0x00000000FFFFFFFF +PQCLEAN_MCELIECE6960119_AVX_MASK5_1: .quad 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000 + diff --git a/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/consts.inc 
b/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/consts.inc new file mode 100644 index 0000000000..1f8f716c29 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/consts.inc @@ -0,0 +1,502 @@ +//64 +{ + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X6969969669699696, 0X6969969669699696, 0X6969969669699696, 0X6969969669699696), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X9966669966999966, 0X9966669966999966, 0X9966669966999966, 0X9966669966999966), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X9966669966999966, 0X9966669966999966, 0X9966669966999966, 0X9966669966999966), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0XFF0000FF00FFFF00, 0XFF0000FF00FFFF00, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XCC3333CCCC3333CC, 0XCC3333CCCC3333CC, 0XCC3333CCCC3333CC, 0XCC3333CCCC3333CC), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X9966669966999966, 0X9966669966999966, 0X9966669966999966, 0X9966669966999966), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X6666666666666666, 0X6666666666666666, 0X6666666666666666, 0X6666666666666666), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XA55AA55AA55AA55A, 0XA55AA55AA55AA55A, 0XA55AA55AA55AA55A, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XCCCC33333333CCCC, 0XCCCC33333333CCCC, 0XCCCC33333333CCCC, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0FF0F00FF00F0FF0, 0X0FF0F00FF00F0FF0, 0X0FF0F00FF00F0FF0, 0X0FF0F00FF00F0FF0), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A) +}, +//128 +{ + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X9999999966666666, 0X6666666699999999, 0X9999999966666666, 0X6666666699999999), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3, 0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XCC33CC3333CC33CC, 0X33CC33CCCC33CC33, 0XCC33CC3333CC33CC, 0X33CC33CCCC33CC33), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X3C3C3C3C3C3C3C3C, 0X3C3C3C3C3C3C3C3C, 0X3C3C3C3C3C3C3C3C, 0X3C3C3C3C3C3C3C3C), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XAA5555AAAA5555AA, 0XAA5555AAAA5555AA, 0XAA5555AAAA5555AA, 0XAA5555AAAA5555AA), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XC33C3CC33CC3C33C, 0XC33C3CC33CC3C33C, 0XC33C3CC33CC3C33C, 0XC33C3CC33CC3C33C), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X00FFFF0000FFFF00, 0XFF0000FFFF0000FF, 0X00FFFF0000FFFF00, 0XFF0000FFFF0000FF) +}, +//256 +{ + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFF00000000, 
0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0FF0F00FF00F0FF0, 0X0FF0F00FF00F0FF0, 0XF00F0FF00FF0F00F, 0XF00F0FF00FF0F00F), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X9669966969966996, 0X6996699696699669, 0X6996699696699669, 0X9669966969966996), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000FFFFFFFF0000, 0XFFFF00000000FFFF, 0X0000FFFFFFFF0000, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X33333333CCCCCCCC, 0X33333333CCCCCCCC, 0X33333333CCCCCCCC, 0X33333333CCCCCCCC), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0X5AA5A55AA55A5AA5, 0X5AA5A55AA55A5AA5, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X00FFFF0000FFFF00, 0XFF0000FFFF0000FF, 0XFF0000FFFF0000FF, 0X00FFFF0000FFFF00), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XC33CC33CC33CC33C, 0XC33CC33CC33CC33C, 0XC33CC33CC33CC33C, 0XC33CC33CC33CC33C), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555) +}, +//512 +{ + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XC33C3CC33CC3C33C, 0X3CC3C33CC33C3CC3, 0XC33C3CC33CC3C33C, 0X3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X9966669966999966, 0X6699996699666699, 0X9966669966999966, 0X6699996699666699), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X9966996699669966, 0X6699669966996699, 0X6699669966996699, 0X9966996699669966), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X6969969669699696, 0X6969969669699696, 0X6969969669699696, 0X6969969669699696), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XAA55AA5555AA55AA, 0XAA55AA5555AA55AA, 0XAA55AA5555AA55AA, 0XAA55AA5555AA55AA), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X9966996699669966, 0X9966996699669966, 0X6699669966996699, 0X6699669966996699), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X5AA5A55A5AA5A55A, 0XA55A5AA5A55A5AA5, 0X5AA5A55A5AA5A55A, 0XA55A5AA5A55A5AA5), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XC3C3C3C33C3C3C3C, 0XC3C3C3C33C3C3C3C, 0X3C3C3C3CC3C3C3C3, 0X3C3C3C3CC3C3C3C3), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X3333CCCC3333CCCC, 0X3333CCCC3333CCCC, 0XCCCC3333CCCC3333, 0XCCCC3333CCCC3333), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X9999999966666666, 0X6666666699999999, 0X6666666699999999, 0X9999999966666666), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XC33CC33CC33CC33C, 0X3CC33CC33CC33CC3, 0XC33CC33CC33CC33C, 0X3CC33CC33CC33CC3), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X6666999999996666, 0X9999666666669999, 0X9999666666669999, 0X6666999999996666) +}, +{ + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XC33C3CC33CC3C33C, 0X3CC3C33CC33C3CC3, 0XC33C3CC33CC3C33C, 0X3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X6699996699666699, 0X9966669966999966, 0X6699996699666699, 0X9966669966999966), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X6699669966996699, 0X9966996699669966, 0X9966996699669966, 0X6699669966996699), + 
PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X6969969669699696, 0X6969969669699696, 0X6969969669699696, 0X6969969669699696), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X55AA55AAAA55AA55, 0X55AA55AAAA55AA55, 0X55AA55AAAA55AA55, 0X55AA55AAAA55AA55), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X9966996699669966, 0X9966996699669966, 0X6699669966996699, 0X6699669966996699), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X5AA5A55A5AA5A55A, 0XA55A5AA5A55A5AA5, 0X5AA5A55A5AA5A55A, 0XA55A5AA5A55A5AA5), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XC3C3C3C33C3C3C3C, 0XC3C3C3C33C3C3C3C, 0X3C3C3C3CC3C3C3C3, 0X3C3C3C3CC3C3C3C3), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X3333CCCC3333CCCC, 0X3333CCCC3333CCCC, 0XCCCC3333CCCC3333, 0XCCCC3333CCCC3333), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X9999999966666666, 0X6666666699999999, 0X6666666699999999, 0X9999999966666666), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XC33CC33CC33CC33C, 0X3CC33CC33CC33CC3, 0XC33CC33CC33CC33C, 0X3CC33CC33CC33CC3), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X6666999999996666, 0X9999666666669999, 0X9999666666669999, 0X6666999999996666) +}, +//1024 +{ + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3, 0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA, 0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA, 0XAA5555AA55AAAA55, 0XAA5555AA55AAAA55), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XF00F0FF0F00F0FF0, 0XF00F0FF0F00F0FF0, 0X0FF0F00F0FF0F00F, 0X0FF0F00F0FF0F00F), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X9669699696696996, 0X9669699696696996, 0X9669699696696996, 0X9669699696696996), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XA55AA55AA55AA55A, 0X5AA55AA55AA55AA5, 0X5AA55AA55AA55AA5, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA, 0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XCCCC33333333CCCC, 0X3333CCCCCCCC3333, 0X3333CCCCCCCC3333, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000, 0XFFFF00000000FFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X6996699669966996, 0X9669966996699669, 0X9669966996699669, 0X6996699669966996) +}, +{ + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3, 0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA, 0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XAA5555AA55AAAA55, 0XAA5555AA55AAAA55, 0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0FF0F00F0FF0F00F, 
0X0FF0F00F0FF0F00F, 0XF00F0FF0F00F0FF0, 0XF00F0FF0F00F0FF0), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X6996966969969669, 0X6996966969969669, 0X6996966969969669, 0X6996966969969669), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XA55AA55AA55AA55A, 0X5AA55AA55AA55AA5, 0X5AA55AA55AA55AA5, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555, 0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XCCCC33333333CCCC, 0X3333CCCCCCCC3333, 0X3333CCCCCCCC3333, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000, 0XFFFF00000000FFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X6996699669966996, 0X9669966996699669, 0X9669966996699669, 0X6996699669966996) +}, +{ + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3, 0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555, 0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XAA5555AA55AAAA55, 0XAA5555AA55AAAA55, 0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XF00F0FF0F00F0FF0, 0XF00F0FF0F00F0FF0, 0X0FF0F00F0FF0F00F, 0X0FF0F00F0FF0F00F), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X9669699696696996, 0X9669699696696996, 0X9669699696696996, 0X9669699696696996), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XA55AA55AA55AA55A, 0X5AA55AA55AA55AA5, 0X5AA55AA55AA55AA5, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA, 0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XCCCC33333333CCCC, 0X3333CCCCCCCC3333, 0X3333CCCCCCCC3333, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000, 0XFFFF00000000FFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X6996699669966996, 0X9669966996699669, 0X9669966996699669, 0X6996699669966996) +}, +{ + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3, 0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555, 0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA, 0XAA5555AA55AAAA55, 0XAA5555AA55AAAA55), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0FF0F00F0FF0F00F, 0X0FF0F00F0FF0F00F, 0XF00F0FF0F00F0FF0, 0XF00F0FF0F00F0FF0), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X6996966969969669, 0X6996966969969669, 0X6996966969969669, 0X6996966969969669), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XA55AA55AA55AA55A, 0X5AA55AA55AA55AA5, 0X5AA55AA55AA55AA5, 0XA55AA55AA55AA55A), + 
PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555, 0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XCCCC33333333CCCC, 0X3333CCCCCCCC3333, 0X3333CCCCCCCC3333, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000, 0XFFFF00000000FFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X6996699669966996, 0X9669966996699669, 0X9669966996699669, 0X6996699669966996) +}, +//2048 +{ + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA, 0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F, 0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55, 0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0, 0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X6996966996696996, 
0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55, 0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F, 0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA, 0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0, 0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + 
PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X5555555555555555, 0X5555555555555555, 0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA, 0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F, 0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X5555555555555555, 0X5555555555555555, 0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55, 0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0, 0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X5555555555555555, 0X5555555555555555, 0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFF0000FFFF0000, 
0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55, 0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F, 0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X5555555555555555, 0X5555555555555555, 0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA, 0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0, 0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +//4096 +{ + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + 
PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFF00000000, 
0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + 
PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 
0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + 
PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 
0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + 
PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFF00FF00FF00FF00, 
0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +} diff --git a/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/controlbits.c b/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/controlbits.c new file mode 100644 index 0000000000..da9d019351 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/controlbits.c @@ -0,0 +1,274 @@ +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + +#include "controlbits.h" + +#include "params.h" + +#include + +typedef uint8_t bit; + +#define N (1 << GFBITS) + +static bit is_smaller(uint32_t a, uint32_t b) { + uint32_t ret = 0; + + ret = a - b; + ret >>= 31; + + return (bit)ret; +} + +static bit is_smaller_63b(uint64_t a, uint64_t b) { + uint64_t ret = 0; + + ret = a - b; + ret >>= 63; + + return (bit)ret; +} + +static void cswap(uint32_t *x, uint32_t *y, bit swap) { + uint32_t m; + uint32_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +static void cswap_63b(uint64_t *x, uint64_t *y, bit swap) { + uint64_t m; + uint64_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +/* output x = min(input x,input y) */ +/* output y = max(input x,input y) */ + +static void minmax(uint32_t *x, uint32_t *y) { + bit m; + + m = is_smaller(*y, *x); + cswap(x, y, m); +} + +static void minmax_63b(uint64_t *x, uint64_t *y) { + bit m; + + m = is_smaller_63b(*y, *x); + cswap_63b(x, y, m); +} + +/* merge first half of x[0],x[step],...,x[(2*n-1)*step] with second half */ +/* requires n to be a power of 2 */ + +static void merge(int n, uint32_t *x, int step) { + int i; + if (n == 1) { + minmax(&x[0], &x[step]); + } else { + merge(n / 2, x, step * 2); + merge(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax(&x[i * step], &x[(i + 1) * step]); + } + } +} + +static void merge_63b(int n, uint64_t *x, int step) { + int i; + if (n == 1) { + minmax_63b(&x[0], &x[step]); + } else { + merge_63b(n / 2, x, step * 2); + merge_63b(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax_63b(&x[i * step], &x[(i + 1) * step]); + } + } +} + +/* sort x[0],x[1],...,x[n-1] in place */ +/* requires n to be a power of 2 */ + +static void sort(int n, uint32_t *x) { + if (n <= 1) { + return; + } + sort(n / 2, x); + sort(n / 2, x + n / 2); + merge(n / 2, x, 1); +} + +void PQCLEAN_MCELIECE6960119_AVX_sort_63b(int n, uint64_t *x) { + if (n <= 1) { + return; + } + PQCLEAN_MCELIECE6960119_AVX_sort_63b(n / 2, x); + PQCLEAN_MCELIECE6960119_AVX_sort_63b(n / 2, x + n / 2); + merge_63b(n / 2, x, 1); +} + +/* y[pi[i]] = x[i] */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void composeinv(int n, uint32_t *y, const uint32_t *x, const uint32_t *pi) { // NC + int i; + uint32_t t[2 * N]; + + for (i = 0; i < n; ++i) { + t[i] = x[i] | (pi[i] << 16); + } + + sort(n, t); + + for (i = 0; i < n; ++i) { + y[i] = t[i] & 0xFFFF; + } +} + +/* ip[i] = j iff pi[i] = j */ +/* requires n = 2^w */ +/* requires pi to be 
a permutation */ +static void invert(int n, uint32_t *ip, const uint32_t *pi) { + int i; + + for (i = 0; i < n; i++) { + ip[i] = i; + } + + composeinv(n, ip, ip, pi); +} + + +static void flow(int w, uint32_t *x, const uint32_t *y, int t) { + bit m0; + bit m1; + + uint32_t b; + uint32_t y_copy = *y; + + m0 = is_smaller(*y & ((1 << w) - 1), *x & ((1 << w) - 1)); + m1 = is_smaller(0, t); + + cswap(x, &y_copy, m0); + b = m0 & m1; + *x ^= b << w; +} + +/* input: permutation pi */ +/* output: (2w-1)n/2 (or 0 if n==1) control bits c[0],c[step],c[2*step],... */ +/* requires n = 2^w */ +static void controlbitsfrompermutation(int w, int n, int step, int off, unsigned char *c, const uint32_t *pi) { + int i; + int j; + int k; + int t; + uint32_t ip[N] = {0}; + uint32_t I[2 * N] = {0}; + uint32_t P[2 * N] = {0}; + uint32_t PI[2 * N] = {0}; + uint32_t T[2 * N] = {0}; + uint32_t piflip[N] = {0}; + uint32_t subpi[2][N / 2] = {{0}}; + + if (w == 1) { + c[ off / 8 ] |= (pi[0] & 1) << (off % 8); + } + if (w <= 1) { + return; + } + + invert(n, ip, pi); + + for (i = 0; i < n; ++i) { + I[i] = ip[i] | (1 << w); + I[n + i] = pi[i]; + } + + for (i = 0; i < 2 * n; ++i) { + P[i] = (i >> w) + (i & ((1 << w) - 2)) + ((i & 1) << w); + } + + for (t = 0; t < w; ++t) { + composeinv(2 * n, PI, P, I); + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &PI[i], t); + } + + for (i = 0; i < 2 * n; ++i) { + T[i] = I[i ^ 1]; + } + + composeinv(2 * n, I, I, T); + + for (i = 0; i < 2 * n; ++i) { + T[i] = P[i ^ 1]; + } + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &T[i], 1); + } + } + + for (i = 0; i < n; ++i) { + for (j = 0; j < w; ++j) { + piflip[i] = pi[i]; + } + } + + for (i = 0; i < n / 2; ++i) { + c[ (off + i * step) / 8 ] |= ((P[i * 2] >> w) & 1) << ((off + i * step) % 8); + } + for (i = 0; i < n / 2; ++i) { + c[ (off + ((w - 1)*n + i) * step) / 8 ] |= ((P[n + i * 2] >> w) & 1) << ((off + ((w - 1) * n + i) * step) % 8); + } + + for (i = 0; i < n / 2; ++i) { + cswap(&piflip[i * 2], &piflip[i * 2 + 1], (P[n + i * 2] >> w) & 1); + } + + for (k = 0; k < 2; ++k) { + for (i = 0; i < n / 2; ++i) { + subpi[k][i] = piflip[i * 2 + k] >> 1; + } + } + + for (k = 0; k < 2; ++k) { + controlbitsfrompermutation(w - 1, n / 2, step * 2, off + step * (n / 2 + k), c, subpi[k]); + } +} + +/* input: pi, a permutation*/ +/* output: out, control bits w.r.t. pi */ +void PQCLEAN_MCELIECE6960119_AVX_controlbits(unsigned char *out, const uint32_t *pi) { + unsigned int i; + unsigned char c[ (2 * GFBITS - 1) * (1 << GFBITS) / 16 ]; + + for (i = 0; i < sizeof(c); i++) { + c[i] = 0; + } + + controlbitsfrompermutation(GFBITS, (1 << GFBITS), 1, 0, c, pi); + + for (i = 0; i < sizeof(c); i++) { + out[i] = c[i]; + } +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/controlbits.h b/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/controlbits.h new file mode 100644 index 0000000000..8e184b4da3 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/controlbits.h @@ -0,0 +1,15 @@ +#ifndef PQCLEAN_MCELIECE6960119_AVX_CONTROLBITS_H +#define PQCLEAN_MCELIECE6960119_AVX_CONTROLBITS_H +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. 
a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + + +#include + +void PQCLEAN_MCELIECE6960119_AVX_sort_63b(int n, uint64_t *x); +void PQCLEAN_MCELIECE6960119_AVX_controlbits(unsigned char *out, const uint32_t *pi); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/crypto_hash.h b/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/crypto_hash.h new file mode 100644 index 0000000000..222688eea2 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/crypto_hash.h @@ -0,0 +1,7 @@ +#ifndef PQCLEAN_MCELIECE6960119_AVX_CRYPTO_HASH_H +#define PQCLEAN_MCELIECE6960119_AVX_CRYPTO_HASH_H +#include "fips202.h" + +#define crypto_hash_32b(out,in,inlen) shake256(out, 32, in, inlen) + +#endif diff --git a/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/decrypt.c b/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/decrypt.c new file mode 100644 index 0000000000..5bcd7e566e --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/decrypt.c @@ -0,0 +1,236 @@ +/* + This file is for Niederreiter decryption +*/ + +#include "decrypt.h" + +#include "benes.h" +#include "bm.h" +#include "fft.h" +#include "fft_tr.h" +#include "params.h" +#include "util.h" + +#include + +static void scaling(vec256 out[][GFBITS], vec256 inv[][GFBITS], const unsigned char *sk, vec256 *recv) { + int i, j; + + vec128 sk_int[ GFBITS ]; + vec256 eval[32][ GFBITS ]; + vec256 tmp[ GFBITS ]; + + // computing inverses + + PQCLEAN_MCELIECE6960119_AVX_irr_load(sk_int, sk); + + PQCLEAN_MCELIECE6960119_AVX_fft(eval, sk_int); + + for (i = 0; i < 32; i++) { + PQCLEAN_MCELIECE6960119_AVX_vec256_sq(eval[i], eval[i]); + } + + PQCLEAN_MCELIECE6960119_AVX_vec256_copy(inv[0], eval[0]); + + for (i = 1; i < 32; i++) { + PQCLEAN_MCELIECE6960119_AVX_vec256_mul(inv[i], inv[i - 1], eval[i]); + } + + PQCLEAN_MCELIECE6960119_AVX_vec256_inv(tmp, inv[31]); + + for (i = 30; i >= 0; i--) { + PQCLEAN_MCELIECE6960119_AVX_vec256_mul(inv[i + 1], tmp, inv[i]); + PQCLEAN_MCELIECE6960119_AVX_vec256_mul(tmp, tmp, eval[i + 1]); + } + + PQCLEAN_MCELIECE6960119_AVX_vec256_copy(inv[0], tmp); + + // + + for (i = 0; i < 32; i++) { + for (j = 0; j < GFBITS; j++) { + out[i][j] = PQCLEAN_MCELIECE6960119_AVX_vec256_and(inv[i][j], recv[i]); + } + } +} + +static void preprocess(vec128 *recv, const unsigned char *s) { + int i; + unsigned char r[ 1024 ]; + + for (i = 0; i < SYND_BYTES; i++) { + r[i] = s[i]; + } + + r[i - 1] &= (1 << ((GFBITS * SYS_T) % 8)) - 1; // throwing away redundant bits + + for (i = SYND_BYTES; i < 1024; i++) { + r[i] = 0; + } + + for (i = 0; i < 64; i++) { + recv[i] = PQCLEAN_MCELIECE6960119_AVX_load16(r + i * 16); + } +} + +static void postprocess(unsigned char *e, vec128 *err) { + int i; + unsigned char error8[ (1 << GFBITS) / 8 ]; + uint64_t v[2]; + + for (i = 0; i < 64; i++) { + v[0] = PQCLEAN_MCELIECE6960119_AVX_vec128_extract(err[i], 0); + v[1] = PQCLEAN_MCELIECE6960119_AVX_vec128_extract(err[i], 1); + + PQCLEAN_MCELIECE6960119_AVX_store8(error8 + i * 16 + 0, v[0]); + PQCLEAN_MCELIECE6960119_AVX_store8(error8 + i * 16 + 8, v[1]); + } + + for (i = 0; i < SYS_N / 8; i++) { + e[i] = error8[i]; + } +} + +static void scaling_inv(vec256 out[][GFBITS], vec256 inv[][GFBITS], vec256 *recv) { + int i, j; + + for (i = 0; i < 32; i++) { + for (j = 0; j < GFBITS; j++) { + out[i][j] = PQCLEAN_MCELIECE6960119_AVX_vec256_and(inv[i][j], recv[i]); + } + } +} + +static uint16_t weight_check(unsigned char *e, vec128 *error) { + int i; 
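+    /* weight_check: recomputes the Hamming weight of the decoded error in two
+       independent ways -- from the 128-bit error words (w0) and from the packed
+       byte array e (w1) -- and returns 1 only if both counts equal SYS_T,
+       using branch-free arithmetic so the comparison runs in constant time. */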
+ uint16_t w0 = 0; + uint16_t w1 = 0; + uint16_t check; + + for (i = 0; i < 64; i++) { + w0 += _mm_popcnt_u64( PQCLEAN_MCELIECE6960119_AVX_vec128_extract(error[i], 0) ); + w0 += _mm_popcnt_u64( PQCLEAN_MCELIECE6960119_AVX_vec128_extract(error[i], 1) ); + } + + for (i = 0; i < SYS_N / 8; i++) { + w1 += _mm_popcnt_u64( e[i] ); + } + + check = (w0 ^ SYS_T) | (w1 ^ SYS_T); + check -= 1; + check >>= 15; + + return check; +} + +static uint16_t synd_cmp(vec256 *s0, vec256 *s1) { + int i; + vec256 diff; + + diff = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(s0[0], s1[0]); + + for (i = 1; i < GFBITS; i++) { + diff = PQCLEAN_MCELIECE6960119_AVX_vec256_or(diff, PQCLEAN_MCELIECE6960119_AVX_vec256_xor(s0[i], s1[i])); + } + + return (uint16_t)PQCLEAN_MCELIECE6960119_AVX_vec256_testz(diff); +} + +static void reformat_128to256(vec256 *out, vec128 *in) { + int i; + uint64_t v[4]; + + for (i = 0; i < 32; i++) { + v[0] = PQCLEAN_MCELIECE6960119_AVX_vec128_extract(in[2 * i + 0], 0); + v[1] = PQCLEAN_MCELIECE6960119_AVX_vec128_extract(in[2 * i + 0], 1); + v[2] = PQCLEAN_MCELIECE6960119_AVX_vec128_extract(in[2 * i + 1], 0); + v[3] = PQCLEAN_MCELIECE6960119_AVX_vec128_extract(in[2 * i + 1], 1); + + out[i] = PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(v[0], v[1], v[2], v[3]); + } +} + +static void reformat_256to128(vec128 *out, vec256 *in) { + int i; + uint64_t v[4]; + + for (i = 0; i < 32; i++) { + v[0] = PQCLEAN_MCELIECE6960119_AVX_vec256_extract(in[i], 0); + v[1] = PQCLEAN_MCELIECE6960119_AVX_vec256_extract(in[i], 1); + v[2] = PQCLEAN_MCELIECE6960119_AVX_vec256_extract(in[i], 2); + v[3] = PQCLEAN_MCELIECE6960119_AVX_vec256_extract(in[i], 3); + + out[2 * i + 0] = PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(v[0], v[1]); + out[2 * i + 1] = PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(v[2], v[3]); + } +} + +/* Niederreiter decryption with the Berlekamp decoder */ +/* intput: sk, secret key */ +/* c, ciphertext (syndrome) */ +/* output: e, error vector */ +/* return: 0 for success; 1 for failure */ +int PQCLEAN_MCELIECE6960119_AVX_decrypt(unsigned char *e, const unsigned char *sk, const unsigned char *c) { + int i; + + uint16_t check_synd; + uint16_t check_weight; + + vec256 inv[ 32 ][ GFBITS ]; + vec256 scaled[ 32 ][ GFBITS ]; + vec256 eval[32][ GFBITS ]; + + vec128 error128[ 64 ]; + vec256 error256[ 32 ]; + + vec256 s_priv[ GFBITS ]; + vec256 s_priv_cmp[ GFBITS ]; + vec128 locator[ GFBITS ]; + + vec128 recv128[ 64 ]; + vec256 recv256[ 32 ]; + vec256 allone; + + vec128 bits_int[25][32]; + + // Berlekamp decoder + + preprocess(recv128, c); + + PQCLEAN_MCELIECE6960119_AVX_load_bits(bits_int, sk + IRR_BYTES); + PQCLEAN_MCELIECE6960119_AVX_benes(recv128, bits_int, 1); + + reformat_128to256(recv256, recv128); + + scaling(scaled, inv, sk, recv256); + PQCLEAN_MCELIECE6960119_AVX_fft_tr(s_priv, scaled); + PQCLEAN_MCELIECE6960119_AVX_bm(locator, s_priv); + + PQCLEAN_MCELIECE6960119_AVX_fft(eval, locator); + + // reencryption and weight check + + allone = PQCLEAN_MCELIECE6960119_AVX_vec256_set1_16b(0xFFFF); + + for (i = 0; i < 32; i++) { + error256[i] = PQCLEAN_MCELIECE6960119_AVX_vec256_or_reduce(eval[i]); + error256[i] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(error256[i], allone); + } + + scaling_inv(scaled, inv, error256); + PQCLEAN_MCELIECE6960119_AVX_fft_tr(s_priv_cmp, scaled); + + check_synd = synd_cmp(s_priv, s_priv_cmp); + + // + + reformat_256to128(error128, error256); + PQCLEAN_MCELIECE6960119_AVX_benes(error128, bits_int, 0); + + postprocess(e, error128); + + check_weight = weight_check(e, error128); + + return 1 - (check_synd 
& check_weight); +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/decrypt.h b/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/decrypt.h new file mode 100644 index 0000000000..c3dc844470 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/decrypt.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_MCELIECE6960119_AVX_DECRYPT_H +#define PQCLEAN_MCELIECE6960119_AVX_DECRYPT_H +/* + This file is for Nieddereiter decryption +*/ + +int PQCLEAN_MCELIECE6960119_AVX_decrypt(unsigned char * /*e*/, const unsigned char * /*sk*/, const unsigned char * /*c*/); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/encrypt.c b/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/encrypt.c new file mode 100644 index 0000000000..aa63109abc --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/encrypt.c @@ -0,0 +1,104 @@ +/* + This file is for Niederreiter encryption +*/ + +#include "encrypt.h" + +#include "int32_sort.h" +#include "params.h" +#include "randombytes.h" +#include "util.h" + +#include +#include +#include +#include + +#include "gf.h" + +/* input: public key pk, error vector e */ +/* output: syndrome s */ +extern void PQCLEAN_MCELIECE6960119_AVX_syndrome_asm(unsigned char *s, const unsigned char *pk, unsigned char *e); + +/* output: e, an error vector of weight t */ +static void gen_e(unsigned char *e) { + int i, j, eq, count; + + uint16_t ind[ SYS_T * 2 ]; + int32_t ind32[ SYS_T * 2 ]; + uint64_t e_int[ (SYS_N + 63) / 64 ]; + uint64_t one = 1; + uint64_t mask; + uint64_t val[ SYS_T ]; + + while (1) { + randombytes((unsigned char *) ind, sizeof(ind)); + + for (i = 0; i < SYS_T * 2; i++) { + ind[i] &= GFMASK; + } + + // moving and counting indices in the correct range + + count = 0; + for (i = 0; i < SYS_T * 2; i++) { + if (ind[i] < SYS_N) { + ind32[ count++ ] = ind[i]; + } + } + + if (count < SYS_T) { + continue; + } + + // check for repetition + + PQCLEAN_MCELIECE6960119_AVX_int32_sort(ind32, SYS_T); + + eq = 0; + for (i = 1; i < SYS_T; i++) { + if (ind32[i - 1] == ind32[i]) { + eq = 1; + } + } + + if (eq == 0) { + break; + } + } + + for (j = 0; j < SYS_T; j++) { + val[j] = one << (ind32[j] & 63); + } + + for (i = 0; i < (SYS_N + 63) / 64; i++) { + e_int[i] = 0; + + for (j = 0; j < SYS_T; j++) { + mask = i ^ (ind32[j] >> 6); + mask -= 1; + mask >>= 63; + mask = -mask; + + e_int[i] |= val[j] & mask; + } + } + + for (i = 0; i < (SYS_N + 63) / 64 - 1; i++) { + PQCLEAN_MCELIECE6960119_AVX_store8(e, e_int[i]); + e += 8; + } + + for (j = 0; j < (SYS_N % 64); j += 8) { + e[ j / 8 ] = (e_int[i] >> j) & 0xFF; + } +} + +/* input: public key pk */ +/* output: error vector e, syndrome s */ +void PQCLEAN_MCELIECE6960119_AVX_encrypt(unsigned char *s, unsigned char *e, const unsigned char *pk) { + gen_e(e); + + PQCLEAN_MCELIECE6960119_AVX_syndrome_asm(s, pk, e); +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/encrypt.h b/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/encrypt.h new file mode 100644 index 0000000000..5bdf635f91 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/encrypt.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_MCELIECE6960119_AVX_ENCRYPT_H +#define PQCLEAN_MCELIECE6960119_AVX_ENCRYPT_H +/* + This file is for Niederreiter encryption +*/ + + +void PQCLEAN_MCELIECE6960119_AVX_encrypt(unsigned char * /*s*/, unsigned char * /*e*/, const unsigned char * /*pk*/); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/fft.c 
b/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/fft.c new file mode 100644 index 0000000000..59ba1717ab --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/fft.c @@ -0,0 +1,262 @@ +/* + This file is for the Gao-Mateer FFT + sse http://www.math.clemson.edu/~sgao/papers/GM10.pdf +*/ + +#include "fft.h" + +#include "transpose.h" +#include "vec256.h" + +#include + +/* input: in, polynomial in bitsliced form */ +/* output: in, result of applying the radix conversions on in */ +static void radix_conversions(vec128 *in) { + int i, j, k; + vec128 t; + uint64_t v0, v1; + + const vec128 mask[5][2] = { + { + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0x8888888888888888, 0x8888888888888888), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0x4444444444444444, 0x4444444444444444) + }, + { + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0xC0C0C0C0C0C0C0C0, 0xC0C0C0C0C0C0C0C0), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0x3030303030303030, 0x3030303030303030) + }, + { + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0xF000F000F000F000, 0xF000F000F000F000), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0x0F000F000F000F00, 0x0F000F000F000F00) + }, + { + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0xFF000000FF000000, 0xFF000000FF000000), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0x00FF000000FF0000, 0x00FF000000FF0000) + }, + { + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0xFFFF000000000000, 0xFFFF000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0x0000FFFF00000000, 0x0000FFFF00000000) + } + }; + + const vec128 s[5][GFBITS] = { +#include "scalars_2x.inc" + }; + + // + + for (j = 0; j <= 5; j++) { + for (i = 0; i < GFBITS; i++) { + v1 = PQCLEAN_MCELIECE6960119_AVX_vec128_extract(in[i], 1); + v1 ^= v1 >> 32; + v0 = PQCLEAN_MCELIECE6960119_AVX_vec128_extract(in[i], 0); + v0 ^= v1 << 32; + in[i] = PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(v0, v1); + } + + for (i = 0; i < GFBITS; i++) { + for (k = 4; k >= j; k--) { + t = PQCLEAN_MCELIECE6960119_AVX_vec128_and(in[i], mask[k][0]); + t = PQCLEAN_MCELIECE6960119_AVX_vec128_srl_2x(t, 1 << k); + in[i] = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(in[i], t); + + t = PQCLEAN_MCELIECE6960119_AVX_vec128_and(in[i], mask[k][1]); + t = PQCLEAN_MCELIECE6960119_AVX_vec128_srl_2x(t, 1 << k); + in[i] = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(in[i], t); + } + } + + if (j < 5) { + PQCLEAN_MCELIECE6960119_AVX_vec128_mul(in, in, s[j]); // scaling + } + } +} + +/* input: in, result of applying the radix conversions to the input polynomial */ +/* output: out, evaluation results (by applying the FFT butterflies) */ +static void butterflies(vec256 out[][ GFBITS ], vec128 *in) { + int i, j, k, s, b; + + vec128 tmp[ GFBITS ]; + vec256 tmp0[ GFBITS ]; + vec256 tmp1[ GFBITS ]; + vec128 t[ GFBITS ]; + + union { + vec128 v[8][ GFBITS + 1 ]; + vec256 V[8][ (GFBITS + 1) / 2 ]; + } pre; + + union { + vec128 v[64][ 2 ]; + vec256 V[64]; + } buf; + + uint64_t v0, v1; + + const vec256 consts[ 33 ][ GFBITS ] = { +#include "consts.inc" + }; + + uint64_t consts_ptr = 2; + + const unsigned char reversal[64] = { + 0, 32, 16, 48, 8, 40, 24, 56, + 4, 36, 20, 52, 12, 44, 28, 60, + 2, 34, 18, 50, 10, 42, 26, 58, + 6, 38, 22, 54, 14, 46, 30, 62, + 1, 33, 17, 49, 9, 41, 25, 57, + 5, 37, 21, 53, 13, 45, 29, 61, + 3, 35, 19, 51, 11, 43, 27, 59, + 7, 39, 23, 55, 15, 47, 31, 63 + }; + + const uint16_t beta[8] = {2522, 7827, 7801, 8035, 6897, 8167, 3476, 0}; + + // boradcast + + for (j = 0; j < GFBITS; j++) { + t[j] = PQCLEAN_MCELIECE6960119_AVX_vec128_unpack_high(in[j], in[j]); + } + + for (i = 0; i < 8; i += 2) 
{ + for (j = 0; j < GFBITS; j++) { + v0 = (beta[i + 0] >> j) & 1; + v0 = -v0; + v1 = (beta[i + 1] >> j) & 1; + v1 = -v1; + + tmp[j] = PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(v0, v1); + } + + PQCLEAN_MCELIECE6960119_AVX_vec128_mul(tmp, t, tmp); + + for (j = 0; j < GFBITS; j++) { + pre.v[i + 0][j] = PQCLEAN_MCELIECE6960119_AVX_vec128_unpack_low(tmp[j], tmp[j]); + pre.v[i + 1][j] = PQCLEAN_MCELIECE6960119_AVX_vec128_unpack_high(tmp[j], tmp[j]); + } + } + + for (i = 0; i < GFBITS; i += 2) { + if (i != GFBITS - 1) { + buf.v[0][1] = PQCLEAN_MCELIECE6960119_AVX_vec128_unpack_low(in[i + 1], in[i + 1] ^ pre.v[6][i + 1]); + } + buf.v[0][0] = PQCLEAN_MCELIECE6960119_AVX_vec128_unpack_low(in[i + 0], in[i + 0] ^ pre.v[6][i + 0]); + + buf.V[1] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[0], pre.V[0][i / 2]); + buf.V[16] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[0], pre.V[4][i / 2]); + buf.V[3] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[1], pre.V[1][i / 2]); + buf.V[48] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[16], pre.V[5][i / 2]); + buf.V[49] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[48], pre.V[0][i / 2]); + buf.V[2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[0], pre.V[1][i / 2]); + buf.V[51] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[49], pre.V[1][i / 2]); + buf.V[6] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[2], pre.V[2][i / 2]); + buf.V[50] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[51], pre.V[0][i / 2]); + buf.V[7] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[6], pre.V[0][i / 2]); + buf.V[54] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[50], pre.V[2][i / 2]); + buf.V[5] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[7], pre.V[1][i / 2]); + buf.V[55] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[54], pre.V[0][i / 2]); + buf.V[53] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[55], pre.V[1][i / 2]); + buf.V[4] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[0], pre.V[2][i / 2]); + buf.V[52] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[53], pre.V[0][i / 2]); + buf.V[12] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[4], pre.V[3][i / 2]); + buf.V[60] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[52], pre.V[3][i / 2]); + buf.V[13] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[12], pre.V[0][i / 2]); + buf.V[61] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[60], pre.V[0][i / 2]); + buf.V[15] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[13], pre.V[1][i / 2]); + buf.V[63] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[61], pre.V[1][i / 2]); + buf.V[14] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[15], pre.V[0][i / 2]); + buf.V[62] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[63], pre.V[0][i / 2]); + buf.V[10] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[14], pre.V[2][i / 2]); + buf.V[58] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[62], pre.V[2][i / 2]); + buf.V[11] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[10], pre.V[0][i / 2]); + buf.V[59] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[58], pre.V[0][i / 2]); + buf.V[9] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[11], pre.V[1][i / 2]); + buf.V[57] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[59], pre.V[1][i / 2]); + buf.V[56] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[57], pre.V[0][i / 2]); + buf.V[8] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[0], pre.V[3][i / 2]); + buf.V[40] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[56], pre.V[4][i / 2]); + buf.V[24] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[8], pre.V[4][i / 2]); + buf.V[41] = 
PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[40], pre.V[0][i / 2]); + buf.V[25] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[24], pre.V[0][i / 2]); + buf.V[43] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[41], pre.V[1][i / 2]); + buf.V[27] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[25], pre.V[1][i / 2]); + buf.V[42] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[43], pre.V[0][i / 2]); + buf.V[26] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[27], pre.V[0][i / 2]); + buf.V[46] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[42], pre.V[2][i / 2]); + buf.V[30] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[26], pre.V[2][i / 2]); + buf.V[47] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[46], pre.V[0][i / 2]); + buf.V[31] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[30], pre.V[0][i / 2]); + buf.V[45] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[47], pre.V[1][i / 2]); + buf.V[29] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[31], pre.V[1][i / 2]); + buf.V[44] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[45], pre.V[0][i / 2]); + buf.V[28] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[29], pre.V[0][i / 2]); + buf.V[36] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[44], pre.V[3][i / 2]); + buf.V[20] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[28], pre.V[3][i / 2]); + buf.V[37] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[36], pre.V[0][i / 2]); + buf.V[21] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[20], pre.V[0][i / 2]); + buf.V[39] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[37], pre.V[1][i / 2]); + buf.V[23] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[21], pre.V[1][i / 2]); + buf.V[38] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[39], pre.V[0][i / 2]); + buf.V[22] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[23], pre.V[0][i / 2]); + buf.V[34] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[38], pre.V[2][i / 2]); + buf.V[18] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[22], pre.V[2][i / 2]); + buf.V[35] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[34], pre.V[0][i / 2]); + buf.V[19] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[18], pre.V[0][i / 2]); + buf.V[33] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[35], pre.V[1][i / 2]); + buf.V[17] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[19], pre.V[1][i / 2]); + buf.V[32] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[33], pre.V[0][i / 2]); + + // transpose + + PQCLEAN_MCELIECE6960119_AVX_transpose_64x256_sp(buf.V); + + for (j = 0; j < 32; j++) { + if (i != GFBITS - 1) { + out[j][i + 1] = PQCLEAN_MCELIECE6960119_AVX_vec256_unpack_high(buf.V[ reversal[2 * j + 0] ], buf.V[ reversal[2 * j + 1] ]); + } + out[j][i + 0] = PQCLEAN_MCELIECE6960119_AVX_vec256_unpack_low (buf.V[ reversal[2 * j + 0] ], buf.V[ reversal[2 * j + 1] ]); + } + } + + // butterflies + + for (k = 0; k < 32; k += 2) { + for (b = 0; b < GFBITS; b++) { + tmp0[b] = PQCLEAN_MCELIECE6960119_AVX_vec256_unpack_low (out[k][b], out[k + 1][b]); + } + for (b = 0; b < GFBITS; b++) { + tmp1[b] = PQCLEAN_MCELIECE6960119_AVX_vec256_unpack_high (out[k][b], out[k + 1][b]); + } + + PQCLEAN_MCELIECE6960119_AVX_vec256_maa_asm(tmp0, tmp1, consts[1]); + + for (b = 0; b < GFBITS; b++) { + out[k][b] = PQCLEAN_MCELIECE6960119_AVX_vec256_unpack_low (tmp0[b], tmp1[b]); + } + for (b = 0; b < GFBITS; b++) { + out[k + 1][b] = PQCLEAN_MCELIECE6960119_AVX_vec256_unpack_high (tmp0[b], tmp1[b]); + } + } + + for (i = 0; i <= 4; i++) { + s = 1 << i; + + for (j = 0; j < 32; j += 2 * s) { + for (k = j; k < j + s; k++) { + PQCLEAN_MCELIECE6960119_AVX_vec256_maa_asm(out[k], out[k + 
s], consts[ consts_ptr + (k - j) ]); + } + } + + consts_ptr += (1 << i); + } +} + +/* input: in, polynomial in bitsliced form */ +/* output: out, bitsliced results of evaluating in all the field elements */ +void PQCLEAN_MCELIECE6960119_AVX_fft(vec256 out[][GFBITS], vec128 *in) { + radix_conversions(in); + butterflies(out, in); +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/fft.h b/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/fft.h new file mode 100644 index 0000000000..27f697413e --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/fft.h @@ -0,0 +1,17 @@ +#ifndef PQCLEAN_MCELIECE6960119_AVX_FFT_H +#define PQCLEAN_MCELIECE6960119_AVX_FFT_H +/* + This file is for the Gao-Mateer FFT + sse http://www.math.clemson.edu/~sgao/papers/GM10.pdf +*/ + + +#include "params.h" +#include "vec128.h" +#include "vec256.h" +#include + +void PQCLEAN_MCELIECE6960119_AVX_fft(vec256 /*out*/[][GFBITS], vec128 * /*in*/); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/fft_tr.c b/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/fft_tr.c new file mode 100644 index 0000000000..b7fc243953 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/fft_tr.c @@ -0,0 +1,400 @@ +/* + This file is for transpose of the Gao-Mateer FFT + Functions with names ending with _tr are (roughly) the transpose of the corresponding functions in fft.c +*/ + +#include "fft_tr.h" + +#include "transpose.h" + +#include + +static void radix_conversions_tr(vec256 *in) { + int i, j, k; + vec256 t; + uint64_t v[4]; + + const vec256 mask[6][2] = { + { + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0x2222222222222222, 0x2222222222222222, 0x2222222222222222, 0x2222222222222222), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0x4444444444444444, 0x4444444444444444, 0x4444444444444444, 0x4444444444444444) + }, + { + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0x0C0C0C0C0C0C0C0C, 0x0C0C0C0C0C0C0C0C, 0x0C0C0C0C0C0C0C0C, 0x0C0C0C0C0C0C0C0C), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0x3030303030303030, 0x3030303030303030, 0x3030303030303030, 0x3030303030303030) + }, + { + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0x00F000F000F000F0, 0x00F000F000F000F0, 0x00F000F000F000F0, 0x00F000F000F000F0), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0x0F000F000F000F00, 0x0F000F000F000F00, 0x0F000F000F000F00, 0x0F000F000F000F00) + }, + { + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0x0000FF000000FF00, 0x0000FF000000FF00, 0x0000FF000000FF00, 0x0000FF000000FF00), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0x00FF000000FF0000, 0x00FF000000FF0000, 0x00FF000000FF0000, 0x00FF000000FF0000) + }, + { + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0x00000000FFFF0000, 0x00000000FFFF0000, 0x00000000FFFF0000, 0x00000000FFFF0000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0x0000FFFF00000000, 0x0000FFFF00000000, 0x0000FFFF00000000, 0x0000FFFF00000000) + }, + { + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0x00000000FFFFFFFF, 0x00000000FFFFFFFF, 0x00000000FFFFFFFF, 0x00000000FFFFFFFF) + } + }; + + const vec256 s[6][GFBITS] = { +#include "scalars_4x.inc" + }; + + // + + for (j = 6; j >= 0; j--) { + if (j < 6) { + PQCLEAN_MCELIECE6960119_AVX_vec256_mul(in, in, s[j]); // scaling + } + + for (k = j; k <= 4; k++) { + for (i = 0; i < GFBITS; i++) { + t = PQCLEAN_MCELIECE6960119_AVX_vec256_and(in[i], mask[k][0]); + t = PQCLEAN_MCELIECE6960119_AVX_vec256_sll_4x(t, 1 << k); + in[i] = 
PQCLEAN_MCELIECE6960119_AVX_vec256_xor(in[i], t); + + t = PQCLEAN_MCELIECE6960119_AVX_vec256_and(in[i], mask[k][1]); + t = PQCLEAN_MCELIECE6960119_AVX_vec256_sll_4x(t, 1 << k); + in[i] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(in[i], t); + } + } + + if (j <= 5) { + for (i = 0; i < GFBITS; i++) { + v[0] = PQCLEAN_MCELIECE6960119_AVX_vec256_extract(in[i], 0); + v[1] = PQCLEAN_MCELIECE6960119_AVX_vec256_extract(in[i], 1); + v[2] = PQCLEAN_MCELIECE6960119_AVX_vec256_extract(in[i], 2); + v[3] = PQCLEAN_MCELIECE6960119_AVX_vec256_extract(in[i], 3); + + v[1] ^= v[0] >> 32; + v[1] ^= v[1] << 32; + v[3] ^= v[2] >> 32; + v[3] ^= v[3] << 32; + + in[i] = PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(v[0], v[1], v[2], v[3]); + } + } + + for (i = 0; i < GFBITS; i++) { + v[0] = PQCLEAN_MCELIECE6960119_AVX_vec256_extract(in[i], 0); + v[1] = PQCLEAN_MCELIECE6960119_AVX_vec256_extract(in[i], 1); + v[2] = PQCLEAN_MCELIECE6960119_AVX_vec256_extract(in[i], 2); + v[3] = PQCLEAN_MCELIECE6960119_AVX_vec256_extract(in[i], 3); + + v[2] ^= v[1]; + v[3] ^= v[2]; + + in[i] = PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(v[0], v[1], v[2], v[3]); + } + + } +} + +static void butterflies_tr(vec256 *out, vec256 in[][ GFBITS ]) { + int i, j, k, s, b; + + vec256 t0[ GFBITS ]; + vec256 t1[ GFBITS ]; + vec256 t; + + vec128 out128[ GFBITS ][ 2 ]; + vec128 tmp[ GFBITS ]; + + union { + vec128 v[6][ GFBITS + 1 ]; + vec256 V[6][ (GFBITS + 1) / 2 ]; + } pre; + + union { + vec128 v[64][ 2 ]; + vec256 V[64]; + } buf; + + const vec256 consts[ 33 ][ GFBITS ] = { +#include "consts.inc" + }; + + uint64_t v[4]; + uint64_t consts_ptr = 33; + + const unsigned char reversal[] = { + 0, 32, 16, 48, 8, 40, 24, 56, + 4, 36, 20, 52, 12, 44, 28, 60, + 2, 34, 18, 50, 10, 42, 26, 58, + 6, 38, 22, 54, 14, 46, 30, 62, + 1, 33, 17, 49, 9, 41, 25, 57, + 5, 37, 21, 53, 13, 45, 29, 61, + 3, 35, 19, 51, 11, 43, 27, 59, + 7, 39, 23, 55, 15, 47, 31, 63 + }; + + const uint16_t beta[6] = {5246, 5306, 6039, 6685, 4905, 6755}; + + // butterflies + + for (i = 4; i >= 0; i--) { + s = 1 << i; + consts_ptr -= s; + + for (j = 0; j < 32; j += 2 * s) { + for (k = j; k < j + s; k++) { + PQCLEAN_MCELIECE6960119_AVX_vec256_ama_asm(in[k], in[k + s], consts[ consts_ptr + (k - j) ]); + } + } + + } + + for (k = 0; k < 32; k += 2) { + for (b = 0; b < GFBITS; b++) { + t0[b] = PQCLEAN_MCELIECE6960119_AVX_vec256_unpack_low(in[k][b], in[k + 1][b]); + } + for (b = 0; b < GFBITS; b++) { + t1[b] = PQCLEAN_MCELIECE6960119_AVX_vec256_unpack_high(in[k][b], in[k + 1][b]); + } + + PQCLEAN_MCELIECE6960119_AVX_vec256_ama_asm(t0, t1, consts[1]); + + for (b = 0; b < GFBITS; b++) { + in[k][b] = PQCLEAN_MCELIECE6960119_AVX_vec256_unpack_low(t0[b], t1[b]); + } + for (b = 0; b < GFBITS; b++) { + in[k + 1][b] = PQCLEAN_MCELIECE6960119_AVX_vec256_unpack_high(t0[b], t1[b]); + } + + for (b = 0; b < GFBITS; b++) { + t0[b] = PQCLEAN_MCELIECE6960119_AVX_vec256_unpack_low_2x(in[k][b], in[k + 1][b]); + } + for (b = 0; b < GFBITS; b++) { + t1[b] = PQCLEAN_MCELIECE6960119_AVX_vec256_unpack_high_2x(in[k][b], in[k + 1][b]); + } + + PQCLEAN_MCELIECE6960119_AVX_vec256_ama_asm(t0, t1, consts[0]); + + for (b = 0; b < GFBITS; b++) { + in[k + 0][b] = PQCLEAN_MCELIECE6960119_AVX_vec256_unpack_low_2x(t0[b], t1[b]); + } + for (b = 0; b < GFBITS; b++) { + in[k + 1][b] = PQCLEAN_MCELIECE6960119_AVX_vec256_unpack_high_2x(t0[b], t1[b]); + } + } + + + // boradcast + + for (i = 0; i < GFBITS; i += 2) { + // transpose + + for (k = 0; k < 32; k++) { + if (i != GFBITS - 1) { + buf.v[ reversal[2 * k + 0] ][1] = 
PQCLEAN_MCELIECE6960119_AVX_vec256_extract2x(in[ k ][i + 1], 0); + buf.v[ reversal[2 * k + 1] ][1] = PQCLEAN_MCELIECE6960119_AVX_vec256_extract2x(in[ k ][i + 1], 1); + } + + buf.v[ reversal[2 * k + 0] ][0] = PQCLEAN_MCELIECE6960119_AVX_vec256_extract2x(in[ k ][i + 0], 0); + buf.v[ reversal[2 * k + 1] ][0] = PQCLEAN_MCELIECE6960119_AVX_vec256_extract2x(in[ k ][i + 0], 1); + } + + PQCLEAN_MCELIECE6960119_AVX_transpose_64x256_sp(buf.V); + + // + + pre.V[0][i / 2] = buf.V[32]; + buf.V[33] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[33], buf.V[32]); + pre.V[1][i / 2] = buf.V[33]; + buf.V[35] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[35], buf.V[33]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[0][i / 2], buf.V[35]); + buf.V[34] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[34], buf.V[35]); + pre.V[2][i / 2] = buf.V[34]; + buf.V[38] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[38], buf.V[34]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[0][i / 2], buf.V[38]); + buf.V[39] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[39], buf.V[38]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[1][i / 2], buf.V[39]); + buf.V[37] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[37], buf.V[39]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[0][i / 2], buf.V[37]); + buf.V[36] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[36], buf.V[37]); + pre.V[3][i / 2] = buf.V[36]; + buf.V[44] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[44], buf.V[36]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[0][i / 2], buf.V[44]); + buf.V[45] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[45], buf.V[44]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[1][i / 2], buf.V[45]); + buf.V[47] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[47], buf.V[45]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[0][i / 2], buf.V[47]); + buf.V[46] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[46], buf.V[47]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[2][i / 2], buf.V[46]); + buf.V[42] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[42], buf.V[46]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[0][i / 2], buf.V[42]); + buf.V[43] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[43], buf.V[42]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[1][i / 2], buf.V[43]); + buf.V[41] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[41], buf.V[43]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[0][i / 2], buf.V[41]); + buf.V[40] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[40], buf.V[41]); + pre.V[4][i / 2] = buf.V[40]; + buf.V[56] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[56], buf.V[40]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[0][i / 2], buf.V[56]); + buf.V[57] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[57], buf.V[56]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[1][i / 2], buf.V[57]); + buf.V[59] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[59], buf.V[57]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[0][i / 2], buf.V[59]); + buf.V[58] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[58], buf.V[59]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[2][i / 2], buf.V[58]); + buf.V[62] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[62], buf.V[58]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[0][i / 2], buf.V[62]); + buf.V[63] = 
PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[63], buf.V[62]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[1][i / 2], buf.V[63]); + buf.V[61] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[61], buf.V[63]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[0][i / 2], buf.V[61]); + buf.V[60] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[60], buf.V[61]); + pre.V[3][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[3][i / 2], buf.V[60]); + buf.V[52] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[52], buf.V[60]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[0][i / 2], buf.V[52]); + buf.V[53] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[53], buf.V[52]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[1][i / 2], buf.V[53]); + buf.V[55] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[55], buf.V[53]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[0][i / 2], buf.V[55]); + buf.V[54] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[54], buf.V[55]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[2][i / 2], buf.V[54]); + buf.V[50] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[50], buf.V[54]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[0][i / 2], buf.V[50]); + buf.V[51] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[51], buf.V[50]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[1][i / 2], buf.V[51]); + buf.V[49] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[49], buf.V[51]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[0][i / 2], buf.V[49]); + buf.V[48] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[48], buf.V[49]); + pre.V[5][i / 2] = buf.V[48]; + buf.V[16] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[16], buf.V[48]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[0][i / 2], buf.V[16]); + buf.V[17] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[17], buf.V[16]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[1][i / 2], buf.V[17]); + buf.V[19] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[19], buf.V[17]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[0][i / 2], buf.V[19]); + buf.V[18] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[18], buf.V[19]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[2][i / 2], buf.V[18]); + buf.V[22] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[22], buf.V[18]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[0][i / 2], buf.V[22]); + buf.V[23] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[23], buf.V[22]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[1][i / 2], buf.V[23]); + buf.V[21] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[21], buf.V[23]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[0][i / 2], buf.V[21]); + buf.V[20] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[20], buf.V[21]); + pre.V[3][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[3][i / 2], buf.V[20]); + buf.V[28] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[28], buf.V[20]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[0][i / 2], buf.V[28]); + buf.V[29] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[29], buf.V[28]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[1][i / 2], buf.V[29]); + buf.V[31] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[31], buf.V[29]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[0][i / 2], buf.V[31]); + buf.V[30] = 
PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[30], buf.V[31]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[2][i / 2], buf.V[30]); + buf.V[26] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[26], buf.V[30]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[0][i / 2], buf.V[26]); + buf.V[27] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[27], buf.V[26]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[1][i / 2], buf.V[27]); + buf.V[25] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[25], buf.V[27]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[0][i / 2], buf.V[25]); + buf.V[24] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[24], buf.V[25]); + pre.V[4][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[4][i / 2], buf.V[24]); + buf.V[8] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[8], buf.V[24]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[0][i / 2], buf.V[8]); + buf.V[9] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[9], buf.V[8]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[1][i / 2], buf.V[9]); + buf.V[11] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[11], buf.V[9]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[0][i / 2], buf.V[11]); + buf.V[10] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[10], buf.V[11]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[2][i / 2], buf.V[10]); + buf.V[14] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[14], buf.V[10]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[0][i / 2], buf.V[14]); + buf.V[15] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[15], buf.V[14]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[1][i / 2], buf.V[15]); + buf.V[13] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[13], buf.V[15]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[0][i / 2], buf.V[13]); + buf.V[12] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[12], buf.V[13]); + pre.V[3][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[3][i / 2], buf.V[12]); + buf.V[4] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[4], buf.V[12]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[0][i / 2], buf.V[4]); + buf.V[5] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[5], buf.V[4]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[1][i / 2], buf.V[5]); + buf.V[7] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[7], buf.V[5]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[0][i / 2], buf.V[7]); + buf.V[6] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[6], buf.V[7]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[2][i / 2], buf.V[6]); + buf.V[2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[2], buf.V[6]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[0][i / 2], buf.V[2]); + buf.V[3] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[3], buf.V[2]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[1][i / 2], buf.V[3]); + buf.V[1] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[1], buf.V[3]); + + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(pre.V[0][i / 2], buf.V[1]); + t = PQCLEAN_MCELIECE6960119_AVX_vec256_xor(buf.V[0], buf.V[1]); + + if (i != GFBITS - 1) { + out128[i + 1][0] = PQCLEAN_MCELIECE6960119_AVX_vec256_extract2x(t, 1); + } + out128[i + 0][0] = PQCLEAN_MCELIECE6960119_AVX_vec256_extract2x(t, 0); + + } + + // + + for (j = 0; j < GFBITS; j++) { + tmp[j] = 
PQCLEAN_MCELIECE6960119_AVX_vec128_setbits((beta[0] >> j) & 1); + } + + PQCLEAN_MCELIECE6960119_AVX_vec128_mul(tmp, pre.v[0], tmp); + + for (b = 0; b < GFBITS; b++) { + out128[b][1] = tmp[b]; + } + + for (i = 1; i < 6; i++) { + for (j = 0; j < GFBITS; j++) { + tmp[j] = PQCLEAN_MCELIECE6960119_AVX_vec128_setbits((beta[i] >> j) & 1); + } + + PQCLEAN_MCELIECE6960119_AVX_vec128_mul(tmp, pre.v[i], tmp); + + for (b = 0; b < GFBITS; b++) { + out128[b][1] = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(out128[b][1], tmp[b]); + } + } + + for (b = 0; b < GFBITS; b++) { + v[0] = PQCLEAN_MCELIECE6960119_AVX_vec128_extract(out128[b][0], 0); + v[1] = PQCLEAN_MCELIECE6960119_AVX_vec128_extract(out128[b][0], 1); + v[2] = PQCLEAN_MCELIECE6960119_AVX_vec128_extract(out128[b][1], 0); + v[3] = PQCLEAN_MCELIECE6960119_AVX_vec128_extract(out128[b][1], 1); + + out[b] = PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(v[0], v[1], v[2], v[3]); + } +} + +/* justifying the length of the output */ +static void postprocess(vec256 *out) { + int i; + uint64_t v[4]; + + for (i = 0; i < 13; i++) { + v[0] = PQCLEAN_MCELIECE6960119_AVX_vec256_extract(out[i], 0); + v[1] = PQCLEAN_MCELIECE6960119_AVX_vec256_extract(out[i], 1); + v[2] = PQCLEAN_MCELIECE6960119_AVX_vec256_extract(out[i], 2); + v[3] = PQCLEAN_MCELIECE6960119_AVX_vec256_extract(out[i], 3); + + v[3] <<= (128 - SYS_T) * 2; + v[3] >>= (128 - SYS_T) * 2; + + out[i] = PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(v[0], v[1], v[2], v[3]); + } +} + + +void PQCLEAN_MCELIECE6960119_AVX_fft_tr(vec256 *out, vec256 in[][ GFBITS ]) { + butterflies_tr(out, in); + radix_conversions_tr(out); + + postprocess(out); +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/fft_tr.h b/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/fft_tr.h new file mode 100644 index 0000000000..091f30c4db --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/fft_tr.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE6960119_AVX_FFT_TR_H +#define PQCLEAN_MCELIECE6960119_AVX_FFT_TR_H +/* + This file is for transpose of the Gao-Mateer FFT +*/ + + +#include "params.h" +#include "vec256.h" + +void PQCLEAN_MCELIECE6960119_AVX_fft_tr(vec256 * /*out*/, vec256 /*in*/[][ GFBITS ]); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/gf.c b/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/gf.c new file mode 100644 index 0000000000..14a5d2bdc6 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/gf.c @@ -0,0 +1,203 @@ +/* + this file is for functions for field arithmetic +*/ + +#include "gf.h" + +#include "params.h" + +/* check if a == 0 */ +gf PQCLEAN_MCELIECE6960119_AVX_gf_iszero(gf a) { + uint32_t t = a; + + t -= 1; + t >>= 19; + + return (gf) t; +} + +/* field multiplication */ +gf PQCLEAN_MCELIECE6960119_AVX_gf_mul(gf in0, gf in1) { + int i; + + uint64_t tmp; + uint64_t t0; + uint64_t t1; + uint64_t t; + + t0 = in0; + t1 = in1; + + tmp = t0 * (t1 & 1); + + for (i = 1; i < GFBITS; i++) { + tmp ^= (t0 * (t1 & ((uint64_t)1 << i))); + } + + // + + t = tmp & 0x1FF0000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + t = tmp & 0x000E000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + return tmp & GFMASK; +} + +/* square twice */ +static inline gf gf_sq2(gf in) { + int i; + + const uint64_t B[] = {0x1111111111111111, + 0x0303030303030303, + 0x000F000F000F000F, + 0x000000FF000000FF + }; + + const uint64_t M[] = {0x0001FF0000000000, + 0x000000FF80000000, + 0x000000007FC00000, + 0x00000000003FE000 + }; + + uint64_t x = in; + uint64_t 
t; + + x = (x | (x << 24)) & B[3]; + x = (x | (x << 12)) & B[2]; + x = (x | (x << 6)) & B[1]; + x = (x | (x << 3)) & B[0]; + + for (i = 0; i < 4; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* square and multiply */ +static inline gf gf_sqmul(gf in, gf m) { + int i; + + uint64_t x; + uint64_t t0; + uint64_t t1; + uint64_t t; + + const uint64_t M[] = {0x0000001FF0000000, + 0x000000000FF80000, + 0x000000000007E000 + }; + + t0 = in; + t1 = m; + + x = (t1 << 6) * (t0 & (1 << 6)); + + t0 ^= (t0 << 7); + + x ^= (t1 * (t0 & (0x04001))); + x ^= (t1 * (t0 & (0x08002))) << 1; + x ^= (t1 * (t0 & (0x10004))) << 2; + x ^= (t1 * (t0 & (0x20008))) << 3; + x ^= (t1 * (t0 & (0x40010))) << 4; + x ^= (t1 * (t0 & (0x80020))) << 5; + + for (i = 0; i < 3; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* square twice and multiply */ +static inline gf gf_sq2mul(gf in, gf m) { + int i; + + uint64_t x; + uint64_t t0; + uint64_t t1; + uint64_t t; + + const uint64_t M[] = {0x1FF0000000000000, + 0x000FF80000000000, + 0x000007FC00000000, + 0x00000003FE000000, + 0x0000000001FE0000, + 0x000000000001E000 + }; + + t0 = in; + t1 = m; + + x = (t1 << 18) * (t0 & (1 << 6)); + + t0 ^= (t0 << 21); + + x ^= (t1 * (t0 & (0x010000001))); + x ^= (t1 * (t0 & (0x020000002))) << 3; + x ^= (t1 * (t0 & (0x040000004))) << 6; + x ^= (t1 * (t0 & (0x080000008))) << 9; + x ^= (t1 * (t0 & (0x100000010))) << 12; + x ^= (t1 * (t0 & (0x200000020))) << 15; + + for (i = 0; i < 6; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* return num/den */ +gf PQCLEAN_MCELIECE6960119_AVX_gf_frac(gf den, gf num) { + gf tmp_11; + gf tmp_1111; + gf out; + + tmp_11 = gf_sqmul(den, den); // ^11 + tmp_1111 = gf_sq2mul(tmp_11, tmp_11); // ^1111 + out = gf_sq2(tmp_1111); + out = gf_sq2mul(out, tmp_1111); // ^11111111 + out = gf_sq2(out); + out = gf_sq2mul(out, tmp_1111); // ^111111111111 + + return gf_sqmul(out, num); // ^1111111111110 = ^-1 +} + +/* return 1/den */ +gf PQCLEAN_MCELIECE6960119_AVX_gf_inv(gf in) { + return PQCLEAN_MCELIECE6960119_AVX_gf_frac(in, ((gf) 1)); +} + +/* multiplication in GF((2^m)^t) */ +void PQCLEAN_MCELIECE6960119_AVX_GF_mul(gf *out, const gf *in0, const gf *in1) { + int i, j; + + gf prod[237]; + + for (i = 0; i < 237; i++) { + prod[i] = 0; + } + + for (i = 0; i < 119; i++) { + for (j = 0; j < 119; j++) { + prod[i + j] ^= PQCLEAN_MCELIECE6960119_AVX_gf_mul(in0[i], in1[j]); + } + } + + // + + for (i = 236; i >= 119; i--) { + prod[i - 117] ^= PQCLEAN_MCELIECE6960119_AVX_gf_mul(prod[i], (gf) 6400); + prod[i - 119] ^= PQCLEAN_MCELIECE6960119_AVX_gf_mul(prod[i], (gf) 3134); + } + + for (i = 0; i < 119; i++) { + out[i] = prod[i]; + } +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/gf.h b/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/gf.h new file mode 100644 index 0000000000..543e55ef4e --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/gf.h @@ -0,0 +1,22 @@ +#ifndef PQCLEAN_MCELIECE6960119_AVX_GF_H +#define PQCLEAN_MCELIECE6960119_AVX_GF_H +/* + This file is for functions for field arithmetic +*/ + + +#include + +typedef uint16_t gf; + +gf PQCLEAN_MCELIECE6960119_AVX_gf_iszero(gf a); +gf PQCLEAN_MCELIECE6960119_AVX_gf_add(gf in0, gf in1); +gf PQCLEAN_MCELIECE6960119_AVX_gf_mul(gf in0, gf in1); +uint64_t PQCLEAN_MCELIECE6960119_AVX_gf_mul2(gf a, gf b0, gf b1); +gf PQCLEAN_MCELIECE6960119_AVX_gf_frac(gf 
den, gf num); +gf PQCLEAN_MCELIECE6960119_AVX_gf_inv(gf in); + +void PQCLEAN_MCELIECE6960119_AVX_GF_mul(gf *out, const gf *in0, const gf *in1); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/int32_sort.c b/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/int32_sort.c new file mode 100644 index 0000000000..02087a3e4f --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/int32_sort.c @@ -0,0 +1,1211 @@ +#include "int32_sort.h" +#define int32 int32_t + +#include + +typedef __m256i int32x8; +#define int32x8_load(z) _mm256_loadu_si256((__m256i_u *) (z)) +#define int32x8_store(z,i) _mm256_storeu_si256((__m256i_u *) (z),(i)) +#define int32x8_min _mm256_min_epi32 +#define int32x8_max _mm256_max_epi32 + +#define int32x8_MINMAX(a,b) \ + do { \ + int32x8 c = int32x8_min(a, b); \ + (b) = int32x8_max(a,b); \ + (a) = c; \ + } while(0) + +static inline void int32_MINMAX(int32 *a, int32 *b) { + int32 ab = *b ^ *a; + int32 c = *b - *a; + c ^= ab & (c ^ *b); + c >>= 31; + c &= ab; + *a ^= c; + *b ^= c; +} + +static void minmax_vector(int32 *x, int32 *y, size_t n) { + if (n < 8) { + while (n > 0) { + int32_MINMAX(x, y); + ++x; + ++y; + --n; + } + return; + } + if (n & 7) { + int32x8 x0 = int32x8_load(x + n - 8); + int32x8 y0 = int32x8_load(y + n - 8); + int32x8_MINMAX(x0, y0); + int32x8_store(x + n - 8, x0); + int32x8_store(y + n - 8, y0); + n &= ~7; + } + do { + int32x8 x0 = int32x8_load(x); + int32x8 y0 = int32x8_load(y); + int32x8_MINMAX(x0, y0); + int32x8_store(x, x0); + int32x8_store(y, y0); + x += 8; + y += 8; + n -= 8; + } while (n); +} + +/* stages 8,4,2,1 of size-16 bitonic merging */ +static void merge16_finish(int32 *x, int32x8 x0, int32x8 x1, int flagdown) { + int32x8 b0, b1, c0, c1, mask; + + int32x8_MINMAX(x0, x1); + + b0 = _mm256_permute2x128_si256(x0, x1, 0x20); /* A0123B0123 */ + b1 = _mm256_permute2x128_si256(x0, x1, 0x31); /* A4567B4567 */ + + int32x8_MINMAX(b0, b1); + + c0 = _mm256_unpacklo_epi64(b0, b1); /* A0145B0145 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* A2367B2367 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A0213B0213 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A4657B4657 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* A0246B0246 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* A1357B1357 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A0123B0123 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A4567B4567 */ + + x0 = _mm256_permute2x128_si256(b0, b1, 0x20); /* A01234567 */ + x1 = _mm256_permute2x128_si256(b0, b1, 0x31); /* A01234567 */ + + if (flagdown) { + mask = _mm256_set1_epi32(-1); + x0 ^= mask; + x1 ^= mask; + } + + int32x8_store(&x[0], x0); + int32x8_store(&x[8], x1); +} + +/* stages 64,32 of bitonic merging; n is multiple of 128 */ +static void int32_twostages_32(int32 *x, size_t n) { + size_t i; + + while (n > 0) { + for (i = 0; i < 32; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + 32]); + int32x8 x2 = int32x8_load(&x[i + 64]); + int32x8 x3 = int32x8_load(&x[i + 96]); + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + 32], x1); + int32x8_store(&x[i + 64], x2); + int32x8_store(&x[i + 96], x3); + } + x += 128; + n -= 128; + } +} + +/* stages 4q,2q,q of bitonic merging */ +static size_t int32_threestages(int32 *x, size_t n, size_t q) { + size_t k, i; + + for (k = 0; k + 8 * q <= n; k += 8 * q) { + for (i = k; i < k + q; i += 8) { + 
int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + } + + return k; +} + +/* n is a power of 2; n >= 8; if n == 8 then flagdown */ +// NOLINTNEXTLINE(google-readability-function-size) +static void int32_sort_2power(int32 *x, size_t n, int flagdown) { + size_t p, q, i, j, k; + int32x8 mask; + + if (n == 8) { + int32 x0 = x[0]; + int32 x1 = x[1]; + int32 x2 = x[2]; + int32 x3 = x[3]; + int32 x4 = x[4]; + int32 x5 = x[5]; + int32 x6 = x[6]; + int32 x7 = x[7]; + + /* odd-even sort instead of bitonic sort */ + + int32_MINMAX(&x1, &x0); + int32_MINMAX(&x3, &x2); + int32_MINMAX(&x2, &x0); + int32_MINMAX(&x3, &x1); + int32_MINMAX(&x2, &x1); + + int32_MINMAX(&x5, &x4); + int32_MINMAX(&x7, &x6); + int32_MINMAX(&x6, &x4); + int32_MINMAX(&x7, &x5); + int32_MINMAX(&x6, &x5); + + int32_MINMAX(&x4, &x0); + int32_MINMAX(&x6, &x2); + int32_MINMAX(&x4, &x2); + + int32_MINMAX(&x5, &x1); + int32_MINMAX(&x7, &x3); + int32_MINMAX(&x5, &x3); + + int32_MINMAX(&x2, &x1); + int32_MINMAX(&x4, &x3); + int32_MINMAX(&x6, &x5); + + x[0] = x0; + x[1] = x1; + x[2] = x2; + x[3] = x3; + x[4] = x4; + x[5] = x5; + x[6] = x6; + x[7] = x7; + return; + } + + if (n == 16) { + int32x8 x0, x1, b0, b1, c0, c1; + + x0 = int32x8_load(&x[0]); + x1 = int32x8_load(&x[8]); + + mask = _mm256_set_epi32(0, 0, -1, -1, 0, 0, -1, -1); + + x0 ^= mask; /* A01234567 */ + x1 ^= mask; /* B01234567 */ + + b0 = _mm256_unpacklo_epi32(x0, x1); /* AB0AB1AB4AB5 */ + b1 = _mm256_unpackhi_epi32(x0, x1); /* AB2AB3AB6AB7 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* AB0AB2AB4AB6 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* AB1AB3AB5AB7 */ + + int32x8_MINMAX(c0, c1); + + mask = _mm256_set_epi32(0, 0, -1, -1, -1, -1, 0, 0); + c0 ^= mask; + c1 ^= mask; + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A01B01A45B45 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A23B23A67B67 */ + + int32x8_MINMAX(b0, b1); + + x0 = _mm256_unpacklo_epi64(b0, b1); /* A01234567 */ + x1 = _mm256_unpackhi_epi64(b0, b1); /* B01234567 */ + + b0 = _mm256_unpacklo_epi32(x0, x1); /* AB0AB1AB4AB5 */ + b1 = _mm256_unpackhi_epi32(x0, x1); /* AB2AB3AB6AB7 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* AB0AB2AB4AB6 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* AB1AB3AB5AB7 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A01B01A45B45 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A23B23A67B67 */ + + b0 ^= mask; + b1 ^= mask; + + c0 = _mm256_permute2x128_si256(b0, b1, 0x20); /* A01B01A23B23 */ + c1 = _mm256_permute2x128_si256(b0, b1, 0x31); /* A45B45A67B67 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_permute2x128_si256(c0, c1, 0x20); /* A01B01A45B45 */ + b1 = _mm256_permute2x128_si256(c0, c1, 0x31); /* A23B23A67B67 */ + 
+ int32x8_MINMAX(b0, b1); + + x0 = _mm256_unpacklo_epi64(b0, b1); /* A01234567 */ + x1 = _mm256_unpackhi_epi64(b0, b1); /* B01234567 */ + + b0 = _mm256_unpacklo_epi32(x0, x1); /* AB0AB1AB4AB5 */ + b1 = _mm256_unpackhi_epi32(x0, x1); /* AB2AB3AB6AB7 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* AB0AB2AB4AB6 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* AB1AB3AB5AB7 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A01B01A45B45 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A23B23A67B67 */ + + x0 = _mm256_unpacklo_epi64(b0, b1); /* A01234567 */ + x1 = _mm256_unpackhi_epi64(b0, b1); /* B01234567 */ + + mask = _mm256_set1_epi32(-1); + if (flagdown) { + x1 ^= mask; + } else { + x0 ^= mask; + } + + merge16_finish(x, x0, x1, flagdown); + return; + } + + if (n == 32) { + int32x8 x0, x1, x2, x3; + + int32_sort_2power(x, 16, 1); + int32_sort_2power(x + 16, 16, 0); + + x0 = int32x8_load(&x[0]); + x1 = int32x8_load(&x[8]); + x2 = int32x8_load(&x[16]); + x3 = int32x8_load(&x[24]); + + if (flagdown) { + mask = _mm256_set1_epi32(-1); + x0 ^= mask; + x1 ^= mask; + x2 ^= mask; + x3 ^= mask; + } + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + + merge16_finish(x, x0, x1, flagdown); + merge16_finish(x + 16, x2, x3, flagdown); + return; + } + + p = n >> 3; + for (i = 0; i < p; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x2 = int32x8_load(&x[i + 2 * p]); + int32x8 x4 = int32x8_load(&x[i + 4 * p]); + int32x8 x6 = int32x8_load(&x[i + 6 * p]); + + /* odd-even stage instead of bitonic stage */ + + int32x8_MINMAX(x4, x0); + int32x8_MINMAX(x6, x2); + int32x8_MINMAX(x2, x0); + int32x8_MINMAX(x6, x4); + int32x8_MINMAX(x2, x4); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + 2 * p], x2); + int32x8_store(&x[i + 4 * p], x4); + int32x8_store(&x[i + 6 * p], x6); + + int32x8 x1 = int32x8_load(&x[i + p]); + int32x8 x3 = int32x8_load(&x[i + 3 * p]); + int32x8 x5 = int32x8_load(&x[i + 5 * p]); + int32x8 x7 = int32x8_load(&x[i + 7 * p]); + + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x5, x3); + + int32x8_store(&x[i + p], x1); + int32x8_store(&x[i + 3 * p], x3); + int32x8_store(&x[i + 5 * p], x5); + int32x8_store(&x[i + 7 * p], x7); + } + + if (n >= 128) { + int flip, flipflip; + + mask = _mm256_set1_epi32(-1); + + for (j = 0; j < n; j += 32) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 16]); + x0 ^= mask; + x1 ^= mask; + int32x8_store(&x[j], x0); + int32x8_store(&x[j + 16], x1); + } + + p = 8; + for (;;) { /* for p in [8, 16, ..., n/16] */ + q = p >> 1; + while (q >= 128) { + int32_threestages(x, n, q >> 2); + q >>= 3; + } + if (q == 64) { + int32_twostages_32(x, n); + q = 16; + } + if (q == 32) { + q = 8; + for (k = 0; k < n; k += 8 * q) { + for (i = k; i < k + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + 
int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + } + q = 4; + } + if (q == 16) { + q = 8; + for (k = 0; k < n; k += 4 * q) { + for (i = k; i < k + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + } + } + q = 4; + } + if (q == 8) { + for (k = 0; k < n; k += q + q) { + int32x8 x0 = int32x8_load(&x[k]); + int32x8 x1 = int32x8_load(&x[k + q]); + + int32x8_MINMAX(x0, x1); + + int32x8_store(&x[k], x0); + int32x8_store(&x[k + q], x1); + } + } + + q = n >> 3; + flip = 0; + if (p << 1 == q) { + flip = 1; + } + flipflip = 1 - flip; + for (j = 0; j < q; j += p + p) { + for (k = j; k < j + p + p; k += p) { + for (i = k; i < k + p; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + + if (flip) { + x0 ^= mask; + x1 ^= mask; + x2 ^= mask; + x3 ^= mask; + x4 ^= mask; + x5 ^= mask; + x6 ^= mask; + x7 ^= mask; + } + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + flip ^= 1; + } + flip ^= flipflip; + } + + if (p << 4 == n) { + break; + } + p <<= 1; + } + } + + for (p = 4; p >= 1; p >>= 1) { + int32 *z = x; + int32 *target = x + n; + if (p == 4) { + mask = _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1); + while (z != target) { + int32x8 x0 = int32x8_load(&z[0]); + int32x8 x1 = int32x8_load(&z[8]); + x0 ^= mask; + x1 ^= mask; + int32x8_store(&z[0], x0); + int32x8_store(&z[8], x1); + z += 16; + } + } else if (p == 2) { + mask = _mm256_set_epi32(0, 0, -1, -1, -1, -1, 0, 0); + while (z != target) { + int32x8 x0 = int32x8_load(&z[0]); + int32x8 x1 = int32x8_load(&z[8]); + x0 ^= mask; + x1 ^= mask; + int32x8 b0 = _mm256_permute2x128_si256(x0, x1, 0x20); + int32x8 b1 = _mm256_permute2x128_si256(x0, x1, 0x31); + int32x8_MINMAX(b0, b1); + int32x8 c0 = _mm256_permute2x128_si256(b0, b1, 0x20); + int32x8 c1 = _mm256_permute2x128_si256(b0, b1, 0x31); + int32x8_store(&z[0], c0); + int32x8_store(&z[8], c1); + z += 16; + } + } else { /* p == 1 */ + mask = _mm256_set_epi32(0, -1, -1, 0, 0, -1, -1, 0); + while (z != target) { + int32x8 x0 = int32x8_load(&z[0]); + int32x8 x1 = int32x8_load(&z[8]); + x0 ^= mask; + x1 ^= mask; + int32x8 b0 = _mm256_permute2x128_si256(x0, x1, 0x20); /* A0123B0123 */ + int32x8 b1 = _mm256_permute2x128_si256(x0, x1, 0x31); /* A4567B4567 
*/ + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); /* A0145B0145 */ + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); /* A2367B2367 */ + int32x8_MINMAX(c0, c1); + int32x8 d0 = _mm256_unpacklo_epi64(c0, c1); /* A0123B0123 */ + int32x8 d1 = _mm256_unpackhi_epi64(c0, c1); /* A4567B4567 */ + int32x8_MINMAX(d0, d1); + int32x8 e0 = _mm256_permute2x128_si256(d0, d1, 0x20); + int32x8 e1 = _mm256_permute2x128_si256(d0, d1, 0x31); + int32x8_store(&z[0], e0); + int32x8_store(&z[8], e1); + z += 16; + } + } + + q = n >> 4; + while (q >= 128 || q == 32) { + int32_threestages(x, n, q >> 2); + q >>= 3; + } + while (q >= 16) { + q >>= 1; + for (j = 0; j < n; j += 4 * q) { + for (k = j; k < j + q; k += 8) { + int32x8 x0 = int32x8_load(&x[k]); + int32x8 x1 = int32x8_load(&x[k + q]); + int32x8 x2 = int32x8_load(&x[k + 2 * q]); + int32x8 x3 = int32x8_load(&x[k + 3 * q]); + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + + int32x8_store(&x[k], x0); + int32x8_store(&x[k + q], x1); + int32x8_store(&x[k + 2 * q], x2); + int32x8_store(&x[k + 3 * q], x3); + } + } + q >>= 1; + } + if (q == 8) { + for (j = 0; j < n; j += 2 * q) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + q]); + + int32x8_MINMAX(x0, x1); + + int32x8_store(&x[j], x0); + int32x8_store(&x[j + q], x1); + } + } + + q = n >> 3; + for (k = 0; k < q; k += 8) { + int32x8 x0 = int32x8_load(&x[k]); + int32x8 x1 = int32x8_load(&x[k + q]); + int32x8 x2 = int32x8_load(&x[k + 2 * q]); + int32x8 x3 = int32x8_load(&x[k + 3 * q]); + int32x8 x4 = int32x8_load(&x[k + 4 * q]); + int32x8 x5 = int32x8_load(&x[k + 5 * q]); + int32x8 x6 = int32x8_load(&x[k + 6 * q]); + int32x8 x7 = int32x8_load(&x[k + 7 * q]); + + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + + int32x8_store(&x[k], x0); + int32x8_store(&x[k + q], x1); + int32x8_store(&x[k + 2 * q], x2); + int32x8_store(&x[k + 3 * q], x3); + int32x8_store(&x[k + 4 * q], x4); + int32x8_store(&x[k + 5 * q], x5); + int32x8_store(&x[k + 6 * q], x6); + int32x8_store(&x[k + 7 * q], x7); + } + } + + /* everything is still masked with _mm256_set_epi32(0,-1,0,-1,0,-1,0,-1); */ + mask = _mm256_set1_epi32(-1); + + for (i = 0; i < n; i += 64) { + int32x8 a0 = int32x8_load(&x[i]); + int32x8 a1 = int32x8_load(&x[i + 8]); + int32x8 a2 = int32x8_load(&x[i + 16]); + int32x8 a3 = int32x8_load(&x[i + 24]); + int32x8 a4 = int32x8_load(&x[i + 32]); + int32x8 a5 = int32x8_load(&x[i + 40]); + int32x8 a6 = int32x8_load(&x[i + 48]); + int32x8 a7 = int32x8_load(&x[i + 56]); + + int32x8 b0 = _mm256_unpacklo_epi32(a0, a1); /* AB0AB1AB4AB5 */ + int32x8 b1 = _mm256_unpackhi_epi32(a0, a1); /* AB2AB3AB6AB7 */ + int32x8 b2 = _mm256_unpacklo_epi32(a2, a3); /* CD0CD1CD4CD5 */ + int32x8 b3 = _mm256_unpackhi_epi32(a2, a3); /* CD2CD3CD6CD7 */ + int32x8 b4 = _mm256_unpacklo_epi32(a4, a5); /* EF0EF1EF4EF5 */ + int32x8 b5 = _mm256_unpackhi_epi32(a4, a5); /* EF2EF3EF6EF7 */ + int32x8 b6 = _mm256_unpacklo_epi32(a6, a7); /* GH0GH1GH4GH5 */ + int32x8 b7 = _mm256_unpackhi_epi32(a6, a7); /* GH2GH3GH6GH7 */ + + int32x8 c0 = _mm256_unpacklo_epi64(b0, b2); /* ABCD0ABCD4 */ + int32x8 c1 = _mm256_unpacklo_epi64(b1, b3); /* ABCD2ABCD6 */ + int32x8 c2 = _mm256_unpackhi_epi64(b0, b2); /* ABCD1ABCD5 */ + int32x8 c3 = _mm256_unpackhi_epi64(b1, 
b3); /* ABCD3ABCD7 */ + int32x8 c4 = _mm256_unpacklo_epi64(b4, b6); /* EFGH0EFGH4 */ + int32x8 c5 = _mm256_unpacklo_epi64(b5, b7); /* EFGH2EFGH6 */ + int32x8 c6 = _mm256_unpackhi_epi64(b4, b6); /* EFGH1EFGH5 */ + int32x8 c7 = _mm256_unpackhi_epi64(b5, b7); /* EFGH3EFGH7 */ + + if (flagdown) { + c2 ^= mask; + c3 ^= mask; + c6 ^= mask; + c7 ^= mask; + } else { + c0 ^= mask; + c1 ^= mask; + c4 ^= mask; + c5 ^= mask; + } + + int32x8 d0 = _mm256_permute2x128_si256(c0, c4, 0x20); /* ABCDEFGH0 */ + int32x8 d1 = _mm256_permute2x128_si256(c2, c6, 0x20); /* ABCDEFGH1 */ + int32x8 d2 = _mm256_permute2x128_si256(c1, c5, 0x20); /* ABCDEFGH2 */ + int32x8 d3 = _mm256_permute2x128_si256(c3, c7, 0x20); /* ABCDEFGH5 */ + int32x8 d4 = _mm256_permute2x128_si256(c0, c4, 0x31); /* ABCDEFGH4 */ + int32x8 d5 = _mm256_permute2x128_si256(c2, c6, 0x31); /* ABCDEFGH3 */ + int32x8 d6 = _mm256_permute2x128_si256(c1, c5, 0x31); /* ABCDEFGH6 */ + int32x8 d7 = _mm256_permute2x128_si256(c3, c7, 0x31); /* ABCDEFGH7 */ + + int32x8_MINMAX(d0, d1); + int32x8_MINMAX(d2, d3); + int32x8_MINMAX(d4, d5); + int32x8_MINMAX(d6, d7); + int32x8_MINMAX(d0, d2); + int32x8_MINMAX(d1, d3); + int32x8_MINMAX(d4, d6); + int32x8_MINMAX(d5, d7); + int32x8_MINMAX(d0, d4); + int32x8_MINMAX(d1, d5); + int32x8_MINMAX(d2, d6); + int32x8_MINMAX(d3, d7); + + int32x8 e0 = _mm256_unpacklo_epi32(d0, d1); + int32x8 e1 = _mm256_unpackhi_epi32(d0, d1); + int32x8 e2 = _mm256_unpacklo_epi32(d2, d3); + int32x8 e3 = _mm256_unpackhi_epi32(d2, d3); + int32x8 e4 = _mm256_unpacklo_epi32(d4, d5); + int32x8 e5 = _mm256_unpackhi_epi32(d4, d5); + int32x8 e6 = _mm256_unpacklo_epi32(d6, d7); + int32x8 e7 = _mm256_unpackhi_epi32(d6, d7); + + int32x8 f0 = _mm256_unpacklo_epi64(e0, e2); + int32x8 f1 = _mm256_unpacklo_epi64(e1, e3); + int32x8 f2 = _mm256_unpackhi_epi64(e0, e2); + int32x8 f3 = _mm256_unpackhi_epi64(e1, e3); + int32x8 f4 = _mm256_unpacklo_epi64(e4, e6); + int32x8 f5 = _mm256_unpacklo_epi64(e5, e7); + int32x8 f6 = _mm256_unpackhi_epi64(e4, e6); + int32x8 f7 = _mm256_unpackhi_epi64(e5, e7); + + int32x8 g0 = _mm256_permute2x128_si256(f0, f4, 0x20); + int32x8 g1 = _mm256_permute2x128_si256(f2, f6, 0x20); + int32x8 g2 = _mm256_permute2x128_si256(f1, f5, 0x20); + int32x8 g3 = _mm256_permute2x128_si256(f3, f7, 0x20); + int32x8 g4 = _mm256_permute2x128_si256(f0, f4, 0x31); + int32x8 g5 = _mm256_permute2x128_si256(f2, f6, 0x31); + int32x8 g6 = _mm256_permute2x128_si256(f1, f5, 0x31); + int32x8 g7 = _mm256_permute2x128_si256(f3, f7, 0x31); + + int32x8_store(&x[i], g0); + int32x8_store(&x[i + 8], g1); + int32x8_store(&x[i + 16], g2); + int32x8_store(&x[i + 24], g3); + int32x8_store(&x[i + 32], g4); + int32x8_store(&x[i + 40], g5); + int32x8_store(&x[i + 48], g6); + int32x8_store(&x[i + 56], g7); + } + + q = n >> 4; + while (q >= 128 || q == 32) { + q >>= 2; + for (j = 0; j < n; j += 8 * q) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + 
int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + } + q >>= 1; + } + while (q >= 16) { + q >>= 1; + for (j = 0; j < n; j += 4 * q) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + } + } + q >>= 1; + } + if (q == 8) { + for (j = 0; j < n; j += q + q) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + q]); + int32x8_MINMAX(x0, x1); + int32x8_store(&x[j], x0); + int32x8_store(&x[j + q], x1); + } + } + + q = n >> 3; + for (i = 0; i < q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + + int32x8 b0 = _mm256_unpacklo_epi32(x0, x4); /* AE0AE1AE4AE5 */ + int32x8 b1 = _mm256_unpackhi_epi32(x0, x4); /* AE2AE3AE6AE7 */ + int32x8 b2 = _mm256_unpacklo_epi32(x1, x5); /* BF0BF1BF4BF5 */ + int32x8 b3 = _mm256_unpackhi_epi32(x1, x5); /* BF2BF3BF6BF7 */ + int32x8 b4 = _mm256_unpacklo_epi32(x2, x6); /* CG0CG1CG4CG5 */ + int32x8 b5 = _mm256_unpackhi_epi32(x2, x6); /* CG2CG3CG6CG7 */ + int32x8 b6 = _mm256_unpacklo_epi32(x3, x7); /* DH0DH1DH4DH5 */ + int32x8 b7 = _mm256_unpackhi_epi32(x3, x7); /* DH2DH3DH6DH7 */ + + int32x8 c0 = _mm256_unpacklo_epi64(b0, b4); /* AECG0AECG4 */ + int32x8 c1 = _mm256_unpacklo_epi64(b1, b5); /* AECG2AECG6 */ + int32x8 c2 = _mm256_unpackhi_epi64(b0, b4); /* AECG1AECG5 */ + int32x8 c3 = _mm256_unpackhi_epi64(b1, b5); /* AECG3AECG7 */ + int32x8 c4 = _mm256_unpacklo_epi64(b2, b6); /* BFDH0BFDH4 */ + int32x8 c5 = _mm256_unpacklo_epi64(b3, b7); /* BFDH2BFDH6 */ + int32x8 c6 = _mm256_unpackhi_epi64(b2, b6); /* BFDH1BFDH5 */ + int32x8 c7 = _mm256_unpackhi_epi64(b3, b7); /* BFDH3BFDH7 */ + + int32x8 d0 = _mm256_permute2x128_si256(c0, c4, 0x20); /* AECGBFDH0 */ + int32x8 d1 = _mm256_permute2x128_si256(c1, c5, 0x20); /* AECGBFDH2 */ + int32x8 d2 = _mm256_permute2x128_si256(c2, c6, 0x20); /* AECGBFDH1 */ + int32x8 d3 = _mm256_permute2x128_si256(c3, c7, 0x20); /* AECGBFDH3 */ + int32x8 d4 = _mm256_permute2x128_si256(c0, c4, 0x31); /* AECGBFDH4 */ + int32x8 d5 = _mm256_permute2x128_si256(c1, c5, 0x31); /* AECGBFDH6 */ + int32x8 d6 = _mm256_permute2x128_si256(c2, c6, 0x31); /* AECGBFDH5 */ + int32x8 d7 = _mm256_permute2x128_si256(c3, c7, 0x31); /* AECGBFDH7 */ + + if (flagdown) { + d0 ^= mask; + d1 ^= mask; + d2 ^= mask; + d3 ^= mask; + d4 ^= mask; + d5 ^= mask; + d6 ^= mask; + d7 ^= mask; + } + + int32x8_store(&x[i], d0); + int32x8_store(&x[i + q], d4); + 
int32x8_store(&x[i + 2 * q], d1); + int32x8_store(&x[i + 3 * q], d5); + int32x8_store(&x[i + 4 * q], d2); + int32x8_store(&x[i + 5 * q], d6); + int32x8_store(&x[i + 6 * q], d3); + int32x8_store(&x[i + 7 * q], d7); + } +} + +void PQCLEAN_MCELIECE6960119_AVX_int32_sort(int32 *x, size_t n) { + size_t q, i, j; + + if (n <= 8) { + if (n == 8) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + int32_MINMAX(&x[4], &x[5]); + int32_MINMAX(&x[5], &x[6]); + int32_MINMAX(&x[6], &x[7]); + } + if (n >= 7) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + int32_MINMAX(&x[4], &x[5]); + int32_MINMAX(&x[5], &x[6]); + } + if (n >= 6) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + int32_MINMAX(&x[4], &x[5]); + } + if (n >= 5) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + } + if (n >= 4) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + } + if (n >= 3) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + } + if (n >= 2) { + int32_MINMAX(&x[0], &x[1]); + } + return; + } + + if (!(n & (n - 1))) { + int32_sort_2power(x, n, 0); + return; + } + + q = 8; + while (q < n - q) { + q += q; + } + /* n > q >= 8 */ + + if (q <= 128) { /* n <= 256 */ + int32x8 y[32]; + for (i = q >> 3; i < q >> 2; ++i) { + y[i] = _mm256_set1_epi32(0x7fffffff); + } + for (i = 0; i < n; ++i) { + ((int32 *)y)[i] = x[i]; + } + int32_sort_2power((int32 *) y, 2 * q, 0); + for (i = 0; i < n; ++i) { + x[i] = ((int32 *) y)[i]; + } + return; + } + + int32_sort_2power(x, q, 1); + PQCLEAN_MCELIECE6960119_AVX_int32_sort(x + q, n - q); + + while (q >= 64) { + q >>= 2; + j = int32_threestages(x, n, q); + minmax_vector(x + j, x + j + 4 * q, n - 4 * q - j); + if (j + 4 * q <= n) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + } + j += 4 * q; + } + minmax_vector(x + j, x + j + 2 * q, n - 2 * q - j); + if (j + 2 * q <= n) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8_MINMAX(x0, x1); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + } + j += 2 * q; + } + minmax_vector(x + j, x + j + q, n - q - j); + q >>= 1; + } + if (q == 32) { + j = 0; + for (; j + 64 <= n; j += 64) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 8]); + int32x8 x2 = int32x8_load(&x[j + 16]); + int32x8 x3 = int32x8_load(&x[j + 24]); + int32x8 x4 = int32x8_load(&x[j + 32]); + int32x8 x5 = int32x8_load(&x[j + 40]); + int32x8 x6 = int32x8_load(&x[j + 48]); + int32x8 x7 = int32x8_load(&x[j + 56]); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8 a0 = _mm256_permute2x128_si256(x0, x1, 
0x20); + int32x8 a1 = _mm256_permute2x128_si256(x0, x1, 0x31); + int32x8 a2 = _mm256_permute2x128_si256(x2, x3, 0x20); + int32x8 a3 = _mm256_permute2x128_si256(x2, x3, 0x31); + int32x8 a4 = _mm256_permute2x128_si256(x4, x5, 0x20); + int32x8 a5 = _mm256_permute2x128_si256(x4, x5, 0x31); + int32x8 a6 = _mm256_permute2x128_si256(x6, x7, 0x20); + int32x8 a7 = _mm256_permute2x128_si256(x6, x7, 0x31); + int32x8_MINMAX(a0, a1); + int32x8_MINMAX(a2, a3); + int32x8_MINMAX(a4, a5); + int32x8_MINMAX(a6, a7); + int32x8 b0 = _mm256_permute2x128_si256(a0, a1, 0x20); + int32x8 b1 = _mm256_permute2x128_si256(a0, a1, 0x31); + int32x8 b2 = _mm256_permute2x128_si256(a2, a3, 0x20); + int32x8 b3 = _mm256_permute2x128_si256(a2, a3, 0x31); + int32x8 b4 = _mm256_permute2x128_si256(a4, a5, 0x20); + int32x8 b5 = _mm256_permute2x128_si256(a4, a5, 0x31); + int32x8 b6 = _mm256_permute2x128_si256(a6, a7, 0x20); + int32x8 b7 = _mm256_permute2x128_si256(a6, a7, 0x31); + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); + int32x8 c2 = _mm256_unpacklo_epi64(b2, b3); + int32x8 c3 = _mm256_unpackhi_epi64(b2, b3); + int32x8 c4 = _mm256_unpacklo_epi64(b4, b5); + int32x8 c5 = _mm256_unpackhi_epi64(b4, b5); + int32x8 c6 = _mm256_unpacklo_epi64(b6, b7); + int32x8 c7 = _mm256_unpackhi_epi64(b6, b7); + int32x8_MINMAX(c0, c1); + int32x8_MINMAX(c2, c3); + int32x8_MINMAX(c4, c5); + int32x8_MINMAX(c6, c7); + int32x8 d0 = _mm256_unpacklo_epi32(c0, c1); + int32x8 d1 = _mm256_unpackhi_epi32(c0, c1); + int32x8 d2 = _mm256_unpacklo_epi32(c2, c3); + int32x8 d3 = _mm256_unpackhi_epi32(c2, c3); + int32x8 d4 = _mm256_unpacklo_epi32(c4, c5); + int32x8 d5 = _mm256_unpackhi_epi32(c4, c5); + int32x8 d6 = _mm256_unpacklo_epi32(c6, c7); + int32x8 d7 = _mm256_unpackhi_epi32(c6, c7); + int32x8 e0 = _mm256_unpacklo_epi64(d0, d1); + int32x8 e1 = _mm256_unpackhi_epi64(d0, d1); + int32x8 e2 = _mm256_unpacklo_epi64(d2, d3); + int32x8 e3 = _mm256_unpackhi_epi64(d2, d3); + int32x8 e4 = _mm256_unpacklo_epi64(d4, d5); + int32x8 e5 = _mm256_unpackhi_epi64(d4, d5); + int32x8 e6 = _mm256_unpacklo_epi64(d6, d7); + int32x8 e7 = _mm256_unpackhi_epi64(d6, d7); + int32x8_MINMAX(e0, e1); + int32x8_MINMAX(e2, e3); + int32x8_MINMAX(e4, e5); + int32x8_MINMAX(e6, e7); + int32x8 f0 = _mm256_unpacklo_epi32(e0, e1); + int32x8 f1 = _mm256_unpackhi_epi32(e0, e1); + int32x8 f2 = _mm256_unpacklo_epi32(e2, e3); + int32x8 f3 = _mm256_unpackhi_epi32(e2, e3); + int32x8 f4 = _mm256_unpacklo_epi32(e4, e5); + int32x8 f5 = _mm256_unpackhi_epi32(e4, e5); + int32x8 f6 = _mm256_unpacklo_epi32(e6, e7); + int32x8 f7 = _mm256_unpackhi_epi32(e6, e7); + int32x8_store(&x[j], f0); + int32x8_store(&x[j + 8], f1); + int32x8_store(&x[j + 16], f2); + int32x8_store(&x[j + 24], f3); + int32x8_store(&x[j + 32], f4); + int32x8_store(&x[j + 40], f5); + int32x8_store(&x[j + 48], f6); + int32x8_store(&x[j + 56], f7); + } + minmax_vector(x + j, x + j + 32, n - 32 - j); + goto continue16; + } + if (q == 16) { + j = 0; +continue16: + for (; j + 32 <= n; j += 32) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 8]); + int32x8 x2 = int32x8_load(&x[j + 16]); + int32x8 x3 = int32x8_load(&x[j + 24]); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8 a0 = _mm256_permute2x128_si256(x0, x1, 0x20); + int32x8 a1 = _mm256_permute2x128_si256(x0, x1, 0x31); + int32x8 a2 = _mm256_permute2x128_si256(x2, x3, 0x20); + int32x8 a3 = _mm256_permute2x128_si256(x2, x3, 0x31); + int32x8_MINMAX(a0, a1); + 
int32x8_MINMAX(a2, a3); + int32x8 b0 = _mm256_permute2x128_si256(a0, a1, 0x20); + int32x8 b1 = _mm256_permute2x128_si256(a0, a1, 0x31); + int32x8 b2 = _mm256_permute2x128_si256(a2, a3, 0x20); + int32x8 b3 = _mm256_permute2x128_si256(a2, a3, 0x31); + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); + int32x8 c2 = _mm256_unpacklo_epi64(b2, b3); + int32x8 c3 = _mm256_unpackhi_epi64(b2, b3); + int32x8_MINMAX(c0, c1); + int32x8_MINMAX(c2, c3); + int32x8 d0 = _mm256_unpacklo_epi32(c0, c1); + int32x8 d1 = _mm256_unpackhi_epi32(c0, c1); + int32x8 d2 = _mm256_unpacklo_epi32(c2, c3); + int32x8 d3 = _mm256_unpackhi_epi32(c2, c3); + int32x8 e0 = _mm256_unpacklo_epi64(d0, d1); + int32x8 e1 = _mm256_unpackhi_epi64(d0, d1); + int32x8 e2 = _mm256_unpacklo_epi64(d2, d3); + int32x8 e3 = _mm256_unpackhi_epi64(d2, d3); + int32x8_MINMAX(e0, e1); + int32x8_MINMAX(e2, e3); + int32x8 f0 = _mm256_unpacklo_epi32(e0, e1); + int32x8 f1 = _mm256_unpackhi_epi32(e0, e1); + int32x8 f2 = _mm256_unpacklo_epi32(e2, e3); + int32x8 f3 = _mm256_unpackhi_epi32(e2, e3); + int32x8_store(&x[j], f0); + int32x8_store(&x[j + 8], f1); + int32x8_store(&x[j + 16], f2); + int32x8_store(&x[j + 24], f3); + } + minmax_vector(x + j, x + j + 16, n - 16 - j); + goto continue8; + } + /* q == 8 */ + j = 0; +continue8: + for (; j + 16 <= n; j += 16) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 8]); + int32x8_MINMAX(x0, x1); + int32x8_store(&x[j], x0); + int32x8_store(&x[j + 8], x1); + int32x8 a0 = _mm256_permute2x128_si256(x0, x1, 0x20); /* x0123y0123 */ + int32x8 a1 = _mm256_permute2x128_si256(x0, x1, 0x31); /* x4567y4567 */ + int32x8_MINMAX(a0, a1); + int32x8 b0 = _mm256_permute2x128_si256(a0, a1, 0x20); /* x01234567 */ + int32x8 b1 = _mm256_permute2x128_si256(a0, a1, 0x31); /* y01234567 */ + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); /* x01y01x45y45 */ + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); /* x23y23x67y67 */ + int32x8_MINMAX(c0, c1); + int32x8 d0 = _mm256_unpacklo_epi32(c0, c1); /* x02x13x46x57 */ + int32x8 d1 = _mm256_unpackhi_epi32(c0, c1); /* y02y13y46y57 */ + int32x8 e0 = _mm256_unpacklo_epi64(d0, d1); /* x02y02x46y46 */ + int32x8 e1 = _mm256_unpackhi_epi64(d0, d1); /* x13y13x57y57 */ + int32x8_MINMAX(e0, e1); + int32x8 f0 = _mm256_unpacklo_epi32(e0, e1); /* x01234567 */ + int32x8 f1 = _mm256_unpackhi_epi32(e0, e1); /* y01234567 */ + int32x8_store(&x[j], f0); + int32x8_store(&x[j + 8], f1); + } + minmax_vector(x + j, x + j + 8, n - 8 - j); + if (j + 8 <= n) { + int32_MINMAX(&x[j], &x[j + 4]); + int32_MINMAX(&x[j + 1], &x[j + 5]); + int32_MINMAX(&x[j + 2], &x[j + 6]); + int32_MINMAX(&x[j + 3], &x[j + 7]); + int32_MINMAX(&x[j], &x[j + 2]); + int32_MINMAX(&x[j + 1], &x[j + 3]); + int32_MINMAX(&x[j], &x[j + 1]); + int32_MINMAX(&x[j + 2], &x[j + 3]); + int32_MINMAX(&x[j + 4], &x[j + 6]); + int32_MINMAX(&x[j + 5], &x[j + 7]); + int32_MINMAX(&x[j + 4], &x[j + 5]); + int32_MINMAX(&x[j + 6], &x[j + 7]); + j += 8; + } + minmax_vector(x + j, x + j + 4, n - 4 - j); + if (j + 4 <= n) { + int32_MINMAX(&x[j], &x[j + 2]); + int32_MINMAX(&x[j + 1], &x[j + 3]); + int32_MINMAX(&x[j], &x[j + 1]); + int32_MINMAX(&x[j + 2], &x[j + 3]); + j += 4; + } + if (j + 3 <= n) { + int32_MINMAX(&x[j], &x[j + 2]); + } + if (j + 2 <= n) { + int32_MINMAX(&x[j], &x[j + 1]); + } +} diff --git a/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/int32_sort.h b/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/int32_sort.h new file mode 100644 index 0000000000..1bb75c3c67 --- /dev/null +++ 
b/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/int32_sort.h @@ -0,0 +1,9 @@ +#ifndef PQCLEAN_MCELIECE6960119_AVX_INT32_SORT_H +#define PQCLEAN_MCELIECE6960119_AVX_INT32_SORT_H + +#include +#include + +void PQCLEAN_MCELIECE6960119_AVX_int32_sort(int32_t *x, size_t n); + +#endif diff --git a/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/operations.c b/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/operations.c new file mode 100644 index 0000000000..68577b49e7 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/operations.c @@ -0,0 +1,136 @@ +#include "api.h" + +#include "aes256ctr.h" +#include "controlbits.h" +#include "crypto_hash.h" +#include "decrypt.h" +#include "encrypt.h" +#include "params.h" +#include "pk_gen.h" +#include "randombytes.h" +#include "sk_gen.h" +#include "util.h" + +#include +#include + +int PQCLEAN_MCELIECE6960119_AVX_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const uint8_t *pk +) { + uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; + uint8_t *e = two_e + 1; + uint8_t one_ec[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ] = {1}; + + PQCLEAN_MCELIECE6960119_AVX_encrypt(c, e, pk); + + crypto_hash_32b(c + SYND_BYTES, two_e, sizeof(two_e)); + + memcpy(one_ec + 1, e, SYS_N / 8); + memcpy(one_ec + 1 + SYS_N / 8, c, SYND_BYTES + 32); + + crypto_hash_32b(key, one_ec, sizeof(one_ec)); + + return 0; +} + +int PQCLEAN_MCELIECE6960119_AVX_crypto_kem_dec( + uint8_t *key, + const uint8_t *c, + const uint8_t *sk +) { + int i; + + uint8_t ret_confirm = 0; + uint8_t ret_decrypt = 0; + + uint16_t m; + + uint8_t conf[32]; + uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; + uint8_t *e = two_e + 1; + uint8_t preimage[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ]; + uint8_t *x = preimage; + + // + + ret_decrypt = (uint8_t)PQCLEAN_MCELIECE6960119_AVX_decrypt(e, sk + SYS_N / 8, c); + + crypto_hash_32b(conf, two_e, sizeof(two_e)); + + for (i = 0; i < 32; i++) { + ret_confirm |= conf[i] ^ c[SYND_BYTES + i]; + } + + m = ret_decrypt | ret_confirm; + m -= 1; + m >>= 8; + + *x++ = (~m & 0) | (m & 1); + for (i = 0; i < SYS_N / 8; i++) { + *x++ = (~m & sk[i]) | (m & e[i]); + } + for (i = 0; i < SYND_BYTES + 32; i++) { + *x++ = c[i]; + } + + crypto_hash_32b(key, preimage, sizeof(preimage)); + + return 0; +} + +int PQCLEAN_MCELIECE6960119_AVX_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +) { + int i; + uint8_t seed[ 32 ]; + uint8_t r[ SYS_T * 2 + (1 << GFBITS)*sizeof(uint32_t) + SYS_N / 8 + 32 ]; + uint8_t nonce[ 16 ] = {0}; + uint8_t *rp; + + gf f[ SYS_T ]; // element in GF(2^mt) + gf irr[ SYS_T ]; // Goppa polynomial + uint32_t perm[ 1 << GFBITS ]; // random permutation + + randombytes(seed, sizeof(seed)); + + while (1) { + rp = r; + PQCLEAN_MCELIECE6960119_AVX_aes256ctr(r, sizeof(r), nonce, seed); + memcpy(seed, &r[ sizeof(r) - 32 ], 32); + + for (i = 0; i < SYS_T; i++) { + f[i] = PQCLEAN_MCELIECE6960119_AVX_load2(rp + i * 2); + } + rp += sizeof(f); + if (PQCLEAN_MCELIECE6960119_AVX_genpoly_gen(irr, f)) { + continue; + } + + for (i = 0; i < (1 << GFBITS); i++) { + perm[i] = PQCLEAN_MCELIECE6960119_AVX_load4(rp + i * 4); + } + rp += sizeof(perm); + if (PQCLEAN_MCELIECE6960119_AVX_perm_check(perm)) { + continue; + } + + for (i = 0; i < SYS_T; i++) { + PQCLEAN_MCELIECE6960119_AVX_store2(sk + SYS_N / 8 + i * 2, irr[i]); + } + if (PQCLEAN_MCELIECE6960119_AVX_pk_gen(pk, perm, sk + SYS_N / 8)) { + continue; + } + + memcpy(sk, rp, SYS_N / 8); + PQCLEAN_MCELIECE6960119_AVX_controlbits(sk + SYS_N / 8 + IRR_BYTES, perm); + + break; + } + + return 0; +} + diff --git 
a/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/params.h b/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/params.h new file mode 100644 index 0000000000..e50eff288c --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/params.h @@ -0,0 +1,21 @@ +#ifndef PQCLEAN_MCELIECE6960119_AVX_PARAMS_H +#define PQCLEAN_MCELIECE6960119_AVX_PARAMS_H + +#define GFBITS 13 +#define SYS_N 6960 +#define SYS_T 119 + +#define COND_BYTES ((1 << (GFBITS-4))*(2*GFBITS - 1)) +#define IRR_BYTES (SYS_T * 2) + +#define PK_NROWS (SYS_T*GFBITS) +#define PK_NCOLS (SYS_N - PK_NROWS) +#define PK_ROW_BYTES ((PK_NCOLS + 7)/8) + +#define SK_BYTES (SYS_N/8 + IRR_BYTES + COND_BYTES) +#define SYND_BYTES ((PK_NROWS + 7)/8) + +#define GFMASK ((1 << GFBITS) - 1) + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/pk_gen.c b/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/pk_gen.c new file mode 100644 index 0000000000..df0b63ad88 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/pk_gen.c @@ -0,0 +1,292 @@ +/* + This file is for public-key generation +*/ + +#include "pk_gen.h" + +#include "benes.h" +#include "controlbits.h" +#include "fft.h" +#include "params.h" +#include "util.h" + +#include + +static void de_bitslicing(uint64_t *out, vec256 in[][GFBITS]) { + int i, j, r; + uint64_t u = 0; + + for (i = 0; i < (1 << GFBITS); i++) { + out[i] = 0 ; + } + + for (i = 0; i < 32; i++) { + for (j = GFBITS - 1; j >= 0; j--) { + u = PQCLEAN_MCELIECE6960119_AVX_vec256_extract(in[i][j], 0); + for (r = 0; r < 64; r++) { + out[i * 256 + 0 * 64 + r] <<= 1; + out[i * 256 + 0 * 64 + r] |= (u >> r) & 1; + } + u = PQCLEAN_MCELIECE6960119_AVX_vec256_extract(in[i][j], 1); + for (r = 0; r < 64; r++) { + out[i * 256 + 1 * 64 + r] <<= 1; + out[i * 256 + 1 * 64 + r] |= (u >> r) & 1; + } + u = PQCLEAN_MCELIECE6960119_AVX_vec256_extract(in[i][j], 2); + for (r = 0; r < 64; r++) { + out[i * 256 + 2 * 64 + r] <<= 1; + out[i * 256 + 2 * 64 + r] |= (u >> r) & 1; + } + u = PQCLEAN_MCELIECE6960119_AVX_vec256_extract(in[i][j], 3); + for (r = 0; r < 64; r++) { + out[i * 256 + 3 * 64 + r] <<= 1; + out[i * 256 + 3 * 64 + r] |= (u >> r) & 1; + } + } + } +} + +static void to_bitslicing_2x(vec256 out0[][GFBITS], vec256 out1[][GFBITS], const uint64_t *in) { + int i, j, k, r; + uint64_t u[4] = {0}; + + for (i = 0; i < 32; i++) { + for (j = GFBITS - 1; j >= 0; j--) { + for (k = 0; k < 4; k++) { + for (r = 63; r >= 0; r--) { + u[k] <<= 1; + u[k] |= (in[i * 256 + k * 64 + r] >> (j + GFBITS)) & 1; + } + } + + out1[i][j] = PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(u[0], u[1], u[2], u[3]); + } + + for (j = GFBITS - 1; j >= 0; j--) { + for (k = 0; k < 4; k++) { + for (r = 63; r >= 0; r--) { + u[k] <<= 1; + u[k] |= (in[i * 256 + k * 64 + r] >> j) & 1; + } + } + + out0[i][GFBITS - 1 - j] = PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(u[0], u[1], u[2], u[3]); + } + } +} + +#define NBLOCKS1_H ((SYS_N + 63) / 64) +#define NBLOCKS2_H ((SYS_N + 255) / 256) +#define NBLOCKS1_I ((GFBITS * SYS_T + 63) / 64) +#define NBLOCKS2_I ((GFBITS * SYS_T + 255) / 256) +int PQCLEAN_MCELIECE6960119_AVX_pk_gen(unsigned char *pk, uint32_t *perm, const unsigned char *sk) { + const int block_idx = NBLOCKS1_I - 1; + int tail = (GFBITS * SYS_T) % 64; + + int i, j, k; + int row, c; + + uint64_t mat[ GFBITS * SYS_T ][ NBLOCKS2_H * 4 ]; + uint64_t ops[ GFBITS * SYS_T ][ NBLOCKS1_I ]; + + uint64_t mask; + + vec128 sk_int[ GFBITS ]; + + vec256 consts[ 32 ][ GFBITS ]; + vec256 eval[ 32 ][ GFBITS ]; + vec256 prod[ 32 ][ GFBITS ]; + 
vec256 tmp[ GFBITS ]; + + uint64_t list[1 << GFBITS]; + uint64_t one_row[ 128 ]; + + // compute the inverses + + PQCLEAN_MCELIECE6960119_AVX_irr_load(sk_int, sk); + + PQCLEAN_MCELIECE6960119_AVX_fft(eval, sk_int); + + PQCLEAN_MCELIECE6960119_AVX_vec256_copy(prod[0], eval[0]); + + for (i = 1; i < 32; i++) { + PQCLEAN_MCELIECE6960119_AVX_vec256_mul(prod[i], prod[i - 1], eval[i]); + } + + PQCLEAN_MCELIECE6960119_AVX_vec256_inv(tmp, prod[31]); + + for (i = 30; i >= 0; i--) { + PQCLEAN_MCELIECE6960119_AVX_vec256_mul(prod[i + 1], prod[i], tmp); + PQCLEAN_MCELIECE6960119_AVX_vec256_mul(tmp, tmp, eval[i + 1]); + } + + PQCLEAN_MCELIECE6960119_AVX_vec256_copy(prod[0], tmp); + + // fill matrix + + de_bitslicing(list, prod); + + for (i = 0; i < (1 << GFBITS); i++) { + list[i] <<= GFBITS; + list[i] |= i; + list[i] |= ((uint64_t) perm[i]) << 31; + } + + PQCLEAN_MCELIECE6960119_AVX_sort_63b(1 << GFBITS, list); + + to_bitslicing_2x(consts, prod, list); + + for (i = 0; i < (1 << GFBITS); i++) { + perm[i] = list[i] & GFMASK; + } + + for (j = 0; j < NBLOCKS2_I; j++) { + for (k = 0; k < GFBITS; k++) { + mat[ k ][ 4 * j + 0 ] = PQCLEAN_MCELIECE6960119_AVX_vec256_extract(prod[ j ][ k ], 0); + mat[ k ][ 4 * j + 1 ] = PQCLEAN_MCELIECE6960119_AVX_vec256_extract(prod[ j ][ k ], 1); + mat[ k ][ 4 * j + 2 ] = PQCLEAN_MCELIECE6960119_AVX_vec256_extract(prod[ j ][ k ], 2); + mat[ k ][ 4 * j + 3 ] = PQCLEAN_MCELIECE6960119_AVX_vec256_extract(prod[ j ][ k ], 3); + } + } + + for (i = 1; i < SYS_T; i++) { + for (j = 0; j < NBLOCKS2_I; j++) { + PQCLEAN_MCELIECE6960119_AVX_vec256_mul(prod[j], prod[j], consts[j]); + + for (k = 0; k < GFBITS; k++) { + mat[ i * GFBITS + k ][ 4 * j + 0 ] = PQCLEAN_MCELIECE6960119_AVX_vec256_extract(prod[ j ][ k ], 0); + mat[ i * GFBITS + k ][ 4 * j + 1 ] = PQCLEAN_MCELIECE6960119_AVX_vec256_extract(prod[ j ][ k ], 1); + mat[ i * GFBITS + k ][ 4 * j + 2 ] = PQCLEAN_MCELIECE6960119_AVX_vec256_extract(prod[ j ][ k ], 2); + mat[ i * GFBITS + k ][ 4 * j + 3 ] = PQCLEAN_MCELIECE6960119_AVX_vec256_extract(prod[ j ][ k ], 3); + } + } + } + + // gaussian elimination to obtain an upper triangular matrix + // and keep track of the operations in ops + + for (i = 0; i < PK_NROWS; i++) { + for (j = 0; j < NBLOCKS1_I; j++) { + ops[ i ][ j ] = 0; + } + } + + for (i = 0; i < PK_NROWS; i++) { + ops[ i ][ i / 64 ] = 1; + ops[ i ][ i / 64 ] <<= (i % 64); + } + + uint64_t column[ PK_NROWS ]; + + for (i = 0; i < PK_NROWS; i++) { + column[i] = mat[ i ][ block_idx ]; + } + + for (row = 0; row < PK_NROWS; row++) { + i = row >> 6; + j = row & 63; + + for (k = row + 1; k < PK_NROWS; k++) { + mask = mat[ row ][ i ] >> j; + mask &= 1; + mask -= 1; + + for (c = 0; c < NBLOCKS1_I; c++) { + mat[ row ][ c ] ^= mat[ k ][ c ] & mask; + ops[ row ][ c ] ^= ops[ k ][ c ] & mask; + } + } + + if ( ((mat[ row ][ i ] >> j) & 1) == 0 ) { // return if not systematic + return -1; + } + + for (k = row + 1; k < PK_NROWS; k++) { + mask = mat[ k ][ i ] >> j; + mask &= 1; + mask = -mask; + + for (c = 0; c < NBLOCKS1_I; c++) { + mat[ k ][ c ] ^= mat[ row ][ c ] & mask; + ops[ k ][ c ] ^= ops[ row ][ c ] & mask; + } + } + } + + // computing the lineaer map required to obatin the systematic form + + for (row = PK_NROWS - 1; row >= 0; row--) { + for (k = 0; k < row; k++) { + mask = mat[ k ][ row / 64 ] >> (row & 63); + mask &= 1; + mask = -mask; + + for (c = 0; c < NBLOCKS1_I; c++) { + ops[ k ][ c ] ^= ops[ row ][ c ] & mask; + } + } + } + + // apply the linear map to the non-systematic part + + for (j = NBLOCKS2_I; j < NBLOCKS2_H; j++) { + for (k 
= 0; k < GFBITS; k++) { + mat[ k ][ 4 * j + 0 ] = PQCLEAN_MCELIECE6960119_AVX_vec256_extract(prod[ j ][ k ], 0); + mat[ k ][ 4 * j + 1 ] = PQCLEAN_MCELIECE6960119_AVX_vec256_extract(prod[ j ][ k ], 1); + mat[ k ][ 4 * j + 2 ] = PQCLEAN_MCELIECE6960119_AVX_vec256_extract(prod[ j ][ k ], 2); + mat[ k ][ 4 * j + 3 ] = PQCLEAN_MCELIECE6960119_AVX_vec256_extract(prod[ j ][ k ], 3); + } + } + + for (i = 1; i < SYS_T; i++) { + for (j = NBLOCKS2_I; j < NBLOCKS2_H; j++) { + PQCLEAN_MCELIECE6960119_AVX_vec256_mul(prod[j], prod[j], consts[j]); + + for (k = 0; k < GFBITS; k++) { + mat[ i * GFBITS + k ][ 4 * j + 0 ] = PQCLEAN_MCELIECE6960119_AVX_vec256_extract(prod[ j ][ k ], 0); + mat[ i * GFBITS + k ][ 4 * j + 1 ] = PQCLEAN_MCELIECE6960119_AVX_vec256_extract(prod[ j ][ k ], 1); + mat[ i * GFBITS + k ][ 4 * j + 2 ] = PQCLEAN_MCELIECE6960119_AVX_vec256_extract(prod[ j ][ k ], 2); + mat[ i * GFBITS + k ][ 4 * j + 3 ] = PQCLEAN_MCELIECE6960119_AVX_vec256_extract(prod[ j ][ k ], 3); + } + } + } + + + for (i = 0; i < PK_NROWS; i++) { + mat[ i ][ block_idx ] = column[i]; + } + + for (row = 0; row < PK_NROWS; row++) { + for (k = 0; k < NBLOCKS1_H; k++) { + one_row[ k ] = 0; + } + + for (c = 0; c < PK_NROWS; c++) { + mask = ops[ row ][ c >> 6 ] >> (c & 63); + mask &= 1; + mask = -mask; + + for (k = block_idx; k < NBLOCKS1_H; k++) { + one_row[ k ] ^= mat[ c ][ k ] & mask; + } + } + + for (k = block_idx; k < NBLOCKS1_H - 1; k++) { + one_row[k] = (one_row[k] >> tail) | (one_row[k + 1] << (64 - tail)); + PQCLEAN_MCELIECE6960119_AVX_store8(pk, one_row[k]); + pk += 8; + } + + one_row[k] >>= tail; + PQCLEAN_MCELIECE6960119_AVX_store_i(pk, one_row[k], PK_ROW_BYTES % 8); + + pk[ (PK_ROW_BYTES % 8) - 1 ] &= (1 << (PK_NCOLS % 8)) - 1; // removing redundant bits + + pk += PK_ROW_BYTES % 8; + } + + // + + return 0; +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/pk_gen.h b/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/pk_gen.h new file mode 100644 index 0000000000..dc4baa39e5 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/pk_gen.h @@ -0,0 +1,12 @@ +#ifndef PQCLEAN_MCELIECE6960119_AVX_PK_GEN_H +#define PQCLEAN_MCELIECE6960119_AVX_PK_GEN_H +/* + This file is for public-key generation +*/ + +#include + +int PQCLEAN_MCELIECE6960119_AVX_pk_gen(unsigned char * /*pk*/, uint32_t * /*perm*/, const unsigned char * /*sk*/); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/scalars_2x.inc b/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/scalars_2x.inc new file mode 100644 index 0000000000..69496ccf9c --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/scalars_2x.inc @@ -0,0 +1,75 @@ +{ + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0X3C3CF30C0000C003, 0X0C0F0FCF0F0CF330), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0X0CCCC3F333C0000C, 0XF0000FC33C3CCF3C), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0X03C33F33FCC0C03C, 0X3C0F3F00C3C300FC), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0X0003000F3C03C0C0, 0X3C33CCC0F0F3CC30), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0XF33FF33030CF03F0, 0XC0CFFFFFCCCC30CC), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0X0CF0303300F0CCC0, 0X3FC3F3CCFFFC033F), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0XFF3F0C0CC0FF3CC0, 0XFC3030CCCCC0CFCF), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0XCF3CF0FF003FC000, 0X0FCF0C00CCF333C3), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0XC00FF3CF0303F300, 0XCFFCF33000CFF030), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0X3CCC0CC00CF0CC00, 0X00CFFCC330F30FCC), + 
PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0XF30FFC3C3FCCFC00, 0X3CCC3FCCC0F3FFF3), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0X3F0FC3F0CCF0C000, 0XF00F0C3FC003C0FF), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0X3000FF33CCF0F000, 0X330CCFCC03C0FC33) +}, +{ + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0X0F0F0FF0F000000F, 0XF0FFFFFFF0F00F00), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0X00FFFFFFFF0000F0, 0X00FFF0FFFF0000FF), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0XFFFF00FF00000F00, 0X00FF00000F0F0FFF), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0XFFF000F00F0FF000, 0XF000F0000F00FF0F), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0XFFF0000F0FF000F0, 0XFF000000FFF00000), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0X00FF000FFF000000, 0XF0FF000FF00F0FF0), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0XFF0F0FFF0F0FF000, 0X0F0F0F00FF000F0F), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0X0FFF0000000F0000, 0X0F0F00F0F0F0F000), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0X00F000F0FFF00F00, 0X00F00F00F00F000F), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0X00F00FF00F00F000, 0X00F0F0F00000FFF0), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0XFFF000F000F00000, 0XFFFFFF0FF00F0FFF), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0X00F00F000FF00000, 0X0F0FFFF00FFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0X0000FF0F0000F000, 0XFFFF0F0FFF0FFF00) +}, +{ + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0X00FF0000000000FF, 0X00FF00FF00FF0000), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0XFFFFFFFFFF00FF00, 0XFF00FFFF000000FF), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0XFF0000FF00FF0000, 0X0000FFFF000000FF), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0XFFFF000000FF0000, 0X00FFFF00FF000000), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0XFF00000000FF0000, 0XFFFFFF0000FF00FF), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0X00FFFFFFFF000000, 0X0000FFFF00FFFF00), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0XFF0000FFFFFF0000, 0XFF00FF0000FFFF00), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0XFF00FF00FFFF0000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0X00FFFFFFFF00FF00, 0X0000FF0000000000), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0XFFFF000000000000, 0XFF00FFFF00FFFF00), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0X00FF0000FF000000, 0X00FFFF00000000FF), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0XFF00FF00FF000000, 0X0000FF00FF00FFFF), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0X00FF00FFFF000000, 0XFF0000FFFFFF0000) +}, +{ + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0X000000000000FFFF, 0X0000FFFF00000000), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0XFFFFFFFFFFFF0000, 0XFFFFFFFF0000FFFF), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0X0000000000000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0XFFFF0000FFFF0000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0XFFFFFFFFFFFF0000, 0X0000FFFF00000000), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0X0000FFFF00000000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0X0000FFFF00000000, 0XFFFFFFFF0000FFFF), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0XFFFF000000000000, 0X00000000FFFF0000), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0XFFFF000000000000, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0XFFFF000000000000, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000) +}, +{ + 
PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0X00000000FFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0X0000000000000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000) +} diff --git a/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/scalars_4x.inc b/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/scalars_4x.inc new file mode 100644 index 0000000000..998eb2f17b --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/scalars_4x.inc @@ -0,0 +1,91 @@ +{ + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X3C3CF30C0000C003, 0X0C0F0FCF0F0CF330, 0XF0F30C33CF03F03F, 0X3F30CC0C000F3FCC), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0CCCC3F333C0000C, 0XF0000FC33C3CCF3C, 0X00F30FC00C3300FF, 0XFC3CF030FC3FFF03), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X03C33F33FCC0C03C, 0X3C0F3F00C3C300FC, 0XF3CC3CF3F3FCF33F, 0X33FFFCFF0CCF3CC3), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0003000F3C03C0C0, 0X3C33CCC0F0F3CC30, 0X3C0FC0FC303C3F3C, 0X003CFF33C3CC30CF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XF33FF33030CF03F0, 0XC0CFFFFFCCCC30CC, 0XFC30CF303F3FF00F, 0XCFF3CF33C00F3003), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0CF0303300F0CCC0, 0X3FC3F3CCFFFC033F, 0X33300C0CC3300CF3, 0X00F3CC0CF3003CCF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFF3F0C0CC0FF3CC0, 0XFC3030CCCCC0CFCF, 0X3C030CF3F03FF3F3, 0X3C000CFCCC3C3333), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XCF3CF0FF003FC000, 0X0FCF0C00CCF333C3, 0X3CCC03FCCC3FFC03, 0XF3CF03C0FCF03FF0), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XC00FF3CF0303F300, 0XCFFCF33000CFF030, 0X033C3C3CF0003FC3, 0X3F3C3CF0C330330C), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X3CCC0CC00CF0CC00, 0X00CFFCC330F30FCC, 0XFFC0FF00F0FF0F03, 0X33CCFCC0FF0033F0), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XF30FFC3C3FCCFC00, 0X3CCC3FCCC0F3FFF3, 0XF3F30CF003FCC303, 0X33C300C0F0C003F3), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X3F0FC3F0CCF0C000, 0XF00F0C3FC003C0FF, 0X30CFCFC3CC0F3000, 0X003FF0003F00C00C), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X3000FF33CCF0F000, 0X330CCFCC03C0FC33, 0X0CF30CCF3FCFCC0F, 0XCFF3C3033F030FFF) +}, +{ + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0F0F0FF0F000000F, 0XF0FFFFFFF0F00F00, 0X0F0F00FF0FF0FFFF, 0XFF0F0F00F000F0FF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X00FFFFFFFF0000F0, 0X00FFF0FFFF0000FF, 0XF000F0F00F00FF0F, 0X0FFFFFFFFF00000F), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFF00FF00000F00, 0X00FF00000F0F0FFF, 0X000FFFF0FFF0FF0F, 0XF0FFFF000F00F0FF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFF000F00F0FF000, 0XF000F0000F00FF0F, 0X00F00FFF00000FF0, 0X0F0000F00FFF0FFF), + 
PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFF0000F0FF000F0, 0XFF000000FFF00000, 0XFFFFF0000FFFF00F, 0X0F0F0F00FF0F000F), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X00FF000FFF000000, 0XF0FF000FF00F0FF0, 0XFFF0FFF0000FFFF0, 0X000F0F0FFFF0F000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFF0F0FFF0F0FF000, 0X0F0F0F00FF000F0F, 0XF0F0F0000F0F0F00, 0XF0FFFF0F00F0FF0F), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0FFF0000000F0000, 0X0F0F00F0F0F0F000, 0X00F000F0F00FFF00, 0X0F0F000F0F00F0FF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X00F000F0FFF00F00, 0X00F00F00F00F000F, 0XF0FF0F0FFF00F0FF, 0X0000F0FF00FF0F0F), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X00F00FF00F00F000, 0X00F0F0F00000FFF0, 0XF0FF0FFFF0F0F0FF, 0X00FFFF0FF0FFF0F0), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFF000F000F00000, 0XFFFFFF0FF00F0FFF, 0X00FFFFFFFFFFFFF0, 0X0000000F00F0FFF0), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X00F00F000FF00000, 0X0F0FFFF00FFFFFFF, 0X00FFF0F0FF000F0F, 0XF0F00000FF00F0F0), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000FF0F0000F000, 0XFFFF0F0FFF0FFF00, 0X000FFFF0000FFF00, 0X0F0F0FFFFFFFFFFF) +}, +{ + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X00FF0000000000FF, 0X00FF00FF00FF0000, 0XFFFF00FF00FF00FF, 0XFF0000FFFFFF00FF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFFFF00FF00, 0XFF00FFFF000000FF, 0X00FFFF000000FF00, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFF0000FF00FF0000, 0X0000FFFF000000FF, 0XFFFF00FFFFFFFF00, 0XFFFF000000FFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFF000000FF0000, 0X00FFFF00FF000000, 0X0000FFFF00FFFFFF, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFF00000000FF0000, 0XFFFFFF0000FF00FF, 0X00FF0000FF0000FF, 0XFFFFFF00FFFFFF00), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X00FFFFFFFF000000, 0X0000FFFF00FFFF00, 0XFFFF0000FF00FFFF, 0X00FFFF00FFFF00FF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFF0000FFFFFF0000, 0XFF00FF0000FFFF00, 0XFF000000FFFFFF00, 0X0000FFFF00FF0000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFF00FF00FFFF0000, 0X00000000FFFFFFFF, 0X000000000000FFFF, 0X000000FFFF000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X00FFFFFFFF00FF00, 0X0000FF0000000000, 0XFF00FF00FFFF0000, 0XFF00FF0000FF00FF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFF000000000000, 0XFF00FFFF00FFFF00, 0XFFFF00FFFF00FFFF, 0X00FF0000000000FF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X00FF0000FF000000, 0X00FFFF00000000FF, 0XFFFFFFFFFF00FF00, 0XFF00FFFF00FF00FF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFF00FF00FF000000, 0X0000FF00FF00FFFF, 0XFFFF00FFFF0000FF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X00FF00FFFF000000, 0XFF0000FFFFFF0000, 0X0000FF00000000FF, 0X0000FF000000FFFF) +}, +{ + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X000000000000FFFF, 0X0000FFFF00000000, 0XFFFFFFFFFFFFFFFF, 0X0000FFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFFFFFF0000, 0XFFFFFFFF0000FFFF, 0XFFFFFFFF00000000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X00000000FFFFFFFF, 0XFFFF000000000000, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000000000000000, 0X0000FFFF00000000, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFFFFFF0000, 0X0000FFFF00000000, 0X00000000FFFF0000, 0X00000000FFFF0000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000FFFF00000000, 0XFFFF0000FFFF0000, 0X0000FFFFFFFFFFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000FFFFFFFF0000, 
0X0000FFFFFFFF0000, 0X0000FFFFFFFFFFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0XFFFFFFFF00000000, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000FFFF00000000, 0XFFFFFFFF0000FFFF, 0X000000000000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFF000000000000, 0X00000000FFFF0000, 0X000000000000FFFF, 0X0000FFFF00000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFF000000000000, 0XFFFF0000FFFFFFFF, 0XFFFFFFFFFFFF0000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFF000000000000, 0XFFFF0000FFFFFFFF, 0XFFFFFFFF0000FFFF, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFF00000000, 0X0000000000000000, 0XFFFF0000FFFFFFFF, 0X0000FFFFFFFFFFFF) +}, +{ + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X0000000000000000, 0X00000000FFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFF00000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFF00000000, 0X0000000000000000, 0XFFFFFFFF00000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFF00000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFF00000000, 0X00000000FFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFF00000000) +}, +{ + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0X0000000000000000), + 
PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF) +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/sk_gen.c b/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/sk_gen.c new file mode 100644 index 0000000000..ebc5aa4e8c --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/sk_gen.c @@ -0,0 +1,98 @@ +/* + This file is for secret-key generation +*/ + +#include "sk_gen.h" + +#include "controlbits.h" +#include "gf.h" +#include "params.h" +#include "util.h" + +/* input: f, element in GF((2^m)^t) */ +/* output: out, minimal polynomial of f */ +/* return: 0 for success and -1 for failure */ +int PQCLEAN_MCELIECE6960119_AVX_genpoly_gen(gf *out, gf *f) { + int i, j, k, c; + + gf mat[ SYS_T + 1 ][ SYS_T ]; + gf mask, inv, t; + + // fill matrix + + mat[0][0] = 1; + + for (i = 1; i < SYS_T; i++) { + mat[0][i] = 0; + } + + for (i = 0; i < SYS_T; i++) { + mat[1][i] = f[i]; + } + + for (j = 2; j <= SYS_T; j++) { + PQCLEAN_MCELIECE6960119_AVX_GF_mul(mat[j], mat[j - 1], f); + } + + // gaussian + + for (j = 0; j < SYS_T; j++) { + for (k = j + 1; k < SYS_T; k++) { + mask = PQCLEAN_MCELIECE6960119_AVX_gf_iszero(mat[ j ][ j ]); + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ j ] ^= mat[ c ][ k ] & mask; + } + + } + + if ( mat[ j ][ j ] == 0 ) { // return if not systematic + return -1; + } + + inv = PQCLEAN_MCELIECE6960119_AVX_gf_inv(mat[j][j]); + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ j ] = PQCLEAN_MCELIECE6960119_AVX_gf_mul(mat[ c ][ j ], inv) ; + } + + for (k = 0; k < SYS_T; k++) { + if (k != j) { + t = mat[ j ][ k ]; + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ k ] ^= PQCLEAN_MCELIECE6960119_AVX_gf_mul(mat[ c ][ j ], t); + } + } + } + } + + for (i = 0; i < SYS_T; i++) { + out[i] = mat[ SYS_T ][ i ]; + } + + return 0; +} + +/* input: permutation p represented as a list of 32-bit integers */ +/* output: -1 if some integer repeats in p */ +/* 0 otherwise */ +int PQCLEAN_MCELIECE6960119_AVX_perm_check(const uint32_t *p) { + int i; + uint64_t list[1 << GFBITS]; + + for (i = 0; i < (1 << GFBITS); i++) { + list[i] = p[i]; + } + + PQCLEAN_MCELIECE6960119_AVX_sort_63b(1 << GFBITS, list); + + for (i = 1; i < (1 << GFBITS); i++) { + if (list[i - 1] == list[i]) { + return -1; + } + } + + return 0; +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/sk_gen.h b/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/sk_gen.h new file mode 100644 index 0000000000..6a45a26f39 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/sk_gen.h @@ -0,0 +1,16 @@ +#ifndef PQCLEAN_MCELIECE6960119_AVX_SK_GEN_H +#define PQCLEAN_MCELIECE6960119_AVX_SK_GEN_H +/* + This file is for secret-key generation +*/ + + +#include "gf.h" + +#include <stdint.h> + +int PQCLEAN_MCELIECE6960119_AVX_genpoly_gen(gf * /*out*/, gf * /*f*/); +int PQCLEAN_MCELIECE6960119_AVX_perm_check(const uint32_t * /*p*/); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/syndrome_asm.S b/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/syndrome_asm.S new file mode 100644 index 0000000000..26b117a941 --- /dev/null +++
b/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/syndrome_asm.S @@ -0,0 +1,921 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: int64 b64 + +# qhasm: int64 synd + +# qhasm: int64 addr + +# qhasm: int64 c + +# qhasm: int64 c_all + +# qhasm: int64 row + +# qhasm: reg256 pp + +# qhasm: reg256 ee + +# qhasm: reg256 ss + +# qhasm: int64 b0 + +# qhasm: int64 b1 + +# qhasm: int64 i + +# qhasm: int64 p + +# qhasm: int64 e + +# qhasm: int64 s + +# qhasm: int64 tmp + +# qhasm: stack64 back + +# qhasm: int64 buf_ptr + +# qhasm: stack256 buf + +# qhasm: enter syndrome_asm +.p2align 5 +.global _PQCLEAN_MCELIECE6960119_AVX_syndrome_asm +.global PQCLEAN_MCELIECE6960119_AVX_syndrome_asm +_PQCLEAN_MCELIECE6960119_AVX_syndrome_asm: +PQCLEAN_MCELIECE6960119_AVX_syndrome_asm: +mov %rsp,%r11 +and $31,%r11 +add $64,%r11 +sub %r11,%rsp + +# qhasm: input_2 += 193 +# asm 1: add $193,tmp=int64#4 +# asm 2: movzbq 0(tmp=%rcx +movzbq 0(%rdx),%rcx + +# qhasm: back = tmp +# asm 1: movq back=stack64#1 +# asm 2: movq back=32(%rsp) +movq %rcx,32(%rsp) + +# qhasm: i = 0 +# asm 1: mov $0,>i=int64#4 +# asm 2: mov $0,>i=%rcx +mov $0,%rcx + +# qhasm: inner1: +._inner1: + +# qhasm: addr = input_2 + i +# asm 1: lea (addr=int64#5 +# asm 2: lea (addr=%r8 +lea (%rdx,%rcx),%r8 + +# qhasm: b0 = *(uint8 *) (addr + 0) +# asm 1: movzbq 0(b0=int64#6 +# asm 2: movzbq 0(b0=%r9 +movzbq 0(%r8),%r9 + +# qhasm: b1 = *(uint8 *) (addr + 1) +# asm 1: movzbq 1(b1=int64#7 +# asm 2: movzbq 1(b1=%rax +movzbq 1(%r8),%rax + +# qhasm: (uint64) b0 >>= 3 +# asm 1: shr $3,b0=int64#4 +# asm 2: movzbq 1(b0=%rcx +movzbq 1(%r8),%rcx + +# qhasm: (uint64) b0 >>= 3 +# asm 1: shr $3,buf_ptr=int64#4 +# asm 2: leaq buf_ptr=%rcx +leaq 0(%rsp),%rcx + +# qhasm: row = 1547 +# asm 1: mov $1547,>row=int64#5 +# asm 2: mov $1547,>row=%r8 +mov $1547,%r8 + +# qhasm: loop: +._loop: + +# qhasm: row -= 1 +# asm 1: sub $1,ss=reg256#1 +# asm 2: vmovupd 0(ss=%ymm0 +vmovupd 0(%rsi),%ymm0 + +# qhasm: ee = mem256[ input_2 + 0 ] +# asm 1: vmovupd 0(ee=reg256#2 +# asm 2: vmovupd 0(ee=%ymm1 +vmovupd 0(%rdx),%ymm1 + +# qhasm: ss &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 32(pp=%ymm1 +vmovupd 32(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 32 ] +# asm 1: vmovupd 32(ee=reg256#3 +# asm 2: vmovupd 32(ee=%ymm2 +vmovupd 32(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 64(pp=%ymm1 +vmovupd 64(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 64 ] +# asm 1: vmovupd 64(ee=reg256#3 +# asm 2: vmovupd 64(ee=%ymm2 +vmovupd 64(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 96(pp=%ymm1 +vmovupd 96(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 96 ] +# asm 1: vmovupd 96(ee=reg256#3 +# asm 2: vmovupd 96(ee=%ymm2 +vmovupd 96(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 128(pp=%ymm1 +vmovupd 128(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 128 ] +# asm 1: vmovupd 128(ee=reg256#3 +# asm 2: vmovupd 128(ee=%ymm2 +vmovupd 128(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 160(pp=%ymm1 +vmovupd 160(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 160 ] +# asm 1: vmovupd 160(ee=reg256#3 +# asm 2: 
vmovupd 160(ee=%ymm2 +vmovupd 160(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 192(pp=%ymm1 +vmovupd 192(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 192 ] +# asm 1: vmovupd 192(ee=reg256#3 +# asm 2: vmovupd 192(ee=%ymm2 +vmovupd 192(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 224(pp=%ymm1 +vmovupd 224(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 224 ] +# asm 1: vmovupd 224(ee=reg256#3 +# asm 2: vmovupd 224(ee=%ymm2 +vmovupd 224(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 256(pp=%ymm1 +vmovupd 256(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 256 ] +# asm 1: vmovupd 256(ee=reg256#3 +# asm 2: vmovupd 256(ee=%ymm2 +vmovupd 256(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 288(pp=%ymm1 +vmovupd 288(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 288 ] +# asm 1: vmovupd 288(ee=reg256#3 +# asm 2: vmovupd 288(ee=%ymm2 +vmovupd 288(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 320(pp=%ymm1 +vmovupd 320(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 320 ] +# asm 1: vmovupd 320(ee=reg256#3 +# asm 2: vmovupd 320(ee=%ymm2 +vmovupd 320(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 352(pp=%ymm1 +vmovupd 352(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 352 ] +# asm 1: vmovupd 352(ee=reg256#3 +# asm 2: vmovupd 352(ee=%ymm2 +vmovupd 352(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 384(pp=%ymm1 +vmovupd 384(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 384 ] +# asm 1: vmovupd 384(ee=reg256#3 +# asm 2: vmovupd 384(ee=%ymm2 +vmovupd 384(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 416(pp=%ymm1 +vmovupd 416(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 416 ] +# asm 1: vmovupd 416(ee=reg256#3 +# asm 2: vmovupd 416(ee=%ymm2 +vmovupd 416(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 448(pp=%ymm1 +vmovupd 448(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 448 ] +# asm 1: vmovupd 448(ee=reg256#3 +# asm 2: vmovupd 448(ee=%ymm2 +vmovupd 448(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 480(pp=%ymm1 +vmovupd 480(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 480 ] +# asm 1: vmovupd 480(ee=reg256#3 +# asm 2: vmovupd 480(ee=%ymm2 +vmovupd 480(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 512(pp=%ymm1 +vmovupd 512(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 512 ] +# asm 1: vmovupd 512(ee=reg256#3 +# asm 2: vmovupd 512(ee=%ymm2 +vmovupd 512(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 544(pp=%ymm1 +vmovupd 544(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 544 ] +# asm 1: vmovupd 544(ee=reg256#3 +# asm 2: vmovupd 544(ee=%ymm2 +vmovupd 544(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 576(pp=%ymm1 +vmovupd 576(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 576 ] +# asm 1: vmovupd 576(ee=reg256#3 +# asm 2: vmovupd 576(ee=%ymm2 +vmovupd 576(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 608(pp=%ymm1 +vmovupd 608(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 608 ] +# asm 1: vmovupd 608(ee=reg256#3 +# asm 2: vmovupd 608(ee=%ymm2 +vmovupd 608(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 640(pp=%ymm1 +vmovupd 640(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 640 ] +# asm 1: vmovupd 640(ee=reg256#3 +# asm 2: vmovupd 
640(ee=%ymm2 +vmovupd 640(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand buf=stack256#1 +# asm 2: vmovapd buf=0(%rsp) +vmovapd %ymm0,0(%rsp) + +# qhasm: s = *(uint32 *) (input_1 + 672) +# asm 1: movl 672(s=int64#6d +# asm 2: movl 672(s=%r9d +movl 672(%rsi),%r9d + +# qhasm: e = *(uint32 *) (input_2 + 672) +# asm 1: movl 672(e=int64#7d +# asm 2: movl 672(e=%eax +movl 672(%rdx),%eax + +# qhasm: s &= e +# asm 1: and p=int64#7 +# asm 2: movzbq 676(p=%rax +movzbq 676(%rsi),%rax + +# qhasm: e = *(uint8 *) (input_2 + 676) +# asm 1: movzbq 676(e=int64#8 +# asm 2: movzbq 676(e=%r10 +movzbq 676(%rdx),%r10 + +# qhasm: p &= e +# asm 1: and c_all=int64#6 +# asm 2: popcnt c_all=%r9 +popcnt %r9, %r9 + +# qhasm: b64 = mem64[ buf_ptr + 0 ] +# asm 1: movq 0(b64=int64#7 +# asm 2: movq 0(b64=%rax +movq 0(%rcx),%rax + +# qhasm: c = count(b64) +# asm 1: popcnt c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# qhasm: c_all ^= c +# asm 1: xor b64=int64#7 +# asm 2: movq 8(b64=%rax +movq 8(%rcx),%rax + +# qhasm: c = count(b64) +# asm 1: popcnt c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# qhasm: c_all ^= c +# asm 1: xor b64=int64#7 +# asm 2: movq 16(b64=%rax +movq 16(%rcx),%rax + +# qhasm: c = count(b64) +# asm 1: popcnt c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# qhasm: c_all ^= c +# asm 1: xor b64=int64#7 +# asm 2: movq 24(b64=%rax +movq 24(%rcx),%rax + +# qhasm: c = count(b64) +# asm 1: popcnt c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# qhasm: c_all ^= c +# asm 1: xor addr=int64#7 +# asm 2: mov addr=%rax +mov %r8,%rax + +# qhasm: (uint64) addr >>= 3 +# asm 1: shr $3,synd=int64#8 +# asm 2: movzbq 0(synd=%r10 +movzbq 0(%rax),%r10 + +# qhasm: synd <<= 1 +# asm 1: shl $1,i=int64#2 +# asm 2: mov $676,>i=%rsi +mov $676,%rsi + +# qhasm: inner2: +._inner2: + +# qhasm: i -= 1 +# asm 1: sub $1,addr=int64#4 +# asm 2: lea (addr=%rcx +lea (%rdx,%rsi),%rcx + +# qhasm: b0 = *(uint8 *) (addr + 0) +# asm 1: movzbq 0(b0=int64#5 +# asm 2: movzbq 0(b0=%r8 +movzbq 0(%rcx),%r8 + +# qhasm: b1 = *(uint8 *) (addr + 1) +# asm 1: movzbq 1(b1=int64#6 +# asm 2: movzbq 1(b1=%r9 +movzbq 1(%rcx),%r9 + +# qhasm: (uint64) b0 >>= 5 +# asm 1: shr $5,tmp=int64#2 +# asm 2: movq tmp=%rsi +movq 32(%rsp),%rsi + +# qhasm: *(uint8 *) (input_2 + 0) = tmp +# asm 1: movb i=int64#2 +# asm 2: mov $0,>i=%rsi +mov $0,%rsi + +# qhasm: inner3: +._inner3: + +# qhasm: s = *(uint8 *) (input_0 + 0) +# asm 1: movzbq 0(s=int64#4 +# asm 2: movzbq 0(s=%rcx +movzbq 0(%rdi),%rcx + +# qhasm: e = *(uint8 *) (input_2 + 0) +# asm 1: movzbq 0(e=int64#5 +# asm 2: movzbq 0(e=%r8 +movzbq 0(%rdx),%r8 + +# qhasm: s ^= e +# asm 1: xor s=int64#2 +# asm 2: movzbq 0(s=%rsi +movzbq 0(%rdi),%rsi + +# qhasm: e = *(uint8 *) (input_2 + 0) +# asm 1: movzbq 0(e=int64#3 +# asm 2: movzbq 0(e=%rdx +movzbq 0(%rdx),%rdx + +# qhasm: (uint32) e &= 7 +# asm 1: and $7,mask0=reg128#1 +# asm 2: movdqa PQCLEAN_MCELIECE6960119_AVX_MASK5_0(%rip),>mask0=%xmm0 +movdqa PQCLEAN_MCELIECE6960119_AVX_MASK5_0(%rip),%xmm0 + +# qhasm: mask1 aligned= mem128[ PQCLEAN_MCELIECE6960119_AVX_MASK5_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE6960119_AVX_MASK5_1(%rip),>mask1=reg128#2 +# asm 2: movdqa PQCLEAN_MCELIECE6960119_AVX_MASK5_1(%rip),>mask1=%xmm1 +movdqa PQCLEAN_MCELIECE6960119_AVX_MASK5_1(%rip),%xmm1 + +# qhasm: mask2 aligned= mem128[ PQCLEAN_MCELIECE6960119_AVX_MASK4_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE6960119_AVX_MASK4_0(%rip),>mask2=reg128#3 +# asm 2: movdqa PQCLEAN_MCELIECE6960119_AVX_MASK4_0(%rip),>mask2=%xmm2 +movdqa PQCLEAN_MCELIECE6960119_AVX_MASK4_0(%rip),%xmm2 + +# 
qhasm: mask3 aligned= mem128[ PQCLEAN_MCELIECE6960119_AVX_MASK4_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE6960119_AVX_MASK4_1(%rip),>mask3=reg128#4 +# asm 2: movdqa PQCLEAN_MCELIECE6960119_AVX_MASK4_1(%rip),>mask3=%xmm3 +movdqa PQCLEAN_MCELIECE6960119_AVX_MASK4_1(%rip),%xmm3 + +# qhasm: mask4 aligned= mem128[ PQCLEAN_MCELIECE6960119_AVX_MASK3_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE6960119_AVX_MASK3_0(%rip),>mask4=reg128#5 +# asm 2: movdqa PQCLEAN_MCELIECE6960119_AVX_MASK3_0(%rip),>mask4=%xmm4 +movdqa PQCLEAN_MCELIECE6960119_AVX_MASK3_0(%rip),%xmm4 + +# qhasm: mask5 aligned= mem128[ PQCLEAN_MCELIECE6960119_AVX_MASK3_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE6960119_AVX_MASK3_1(%rip),>mask5=reg128#6 +# asm 2: movdqa PQCLEAN_MCELIECE6960119_AVX_MASK3_1(%rip),>mask5=%xmm5 +movdqa PQCLEAN_MCELIECE6960119_AVX_MASK3_1(%rip),%xmm5 + +# qhasm: x0 = mem128[ input_0 + 0 ] +# asm 1: movdqu 0(x0=reg128#7 +# asm 2: movdqu 0(x0=%xmm6 +movdqu 0(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 128 ] +# asm 1: movdqu 128(x1=reg128#8 +# asm 2: movdqu 128(x1=%xmm7 +movdqu 128(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 256 ] +# asm 1: movdqu 256(x2=reg128#9 +# asm 2: movdqu 256(x2=%xmm8 +movdqu 256(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 384 ] +# asm 1: movdqu 384(x3=reg128#10 +# asm 2: movdqu 384(x3=%xmm9 +movdqu 384(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 512 ] +# asm 1: movdqu 512(x4=reg128#11 +# asm 2: movdqu 512(x4=%xmm10 +movdqu 512(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 640 ] +# asm 1: movdqu 640(x5=reg128#12 +# asm 2: movdqu 640(x5=%xmm11 +movdqu 640(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 768 ] +# asm 1: movdqu 768(x6=reg128#13 +# asm 2: movdqu 768(x6=%xmm12 +movdqu 768(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 896 ] +# asm 1: movdqu 896(x7=reg128#14 +# asm 2: movdqu 896(x7=%xmm13 +movdqu 896(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq 
$32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand 
v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 0 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 16(x0=%xmm6 +movdqu 16(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 144 ] +# asm 1: movdqu 144(x1=reg128#8 +# asm 2: movdqu 144(x1=%xmm7 +movdqu 144(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 272 ] +# asm 1: movdqu 272(x2=reg128#9 +# asm 2: movdqu 272(x2=%xmm8 +movdqu 272(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 400 ] +# asm 1: movdqu 400(x3=reg128#10 +# asm 2: movdqu 400(x3=%xmm9 +movdqu 400(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 528 ] +# asm 1: movdqu 528(x4=reg128#11 +# asm 2: movdqu 
528(x4=%xmm10 +movdqu 528(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 656 ] +# asm 1: movdqu 656(x5=reg128#12 +# asm 2: movdqu 656(x5=%xmm11 +movdqu 656(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 784 ] +# asm 1: movdqu 784(x6=reg128#13 +# asm 2: movdqu 784(x6=%xmm12 +movdqu 784(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 912 ] +# asm 1: movdqu 912(x7=reg128#14 +# asm 2: movdqu 912(x7=%xmm13 +movdqu 912(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: 
v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: 
vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 16 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 32(x0=%xmm6 +movdqu 32(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 160 ] +# asm 1: movdqu 160(x1=reg128#8 +# asm 2: movdqu 160(x1=%xmm7 +movdqu 160(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 288 ] +# asm 1: movdqu 288(x2=reg128#9 +# asm 2: movdqu 288(x2=%xmm8 +movdqu 288(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 416 ] +# asm 1: movdqu 416(x3=reg128#10 +# asm 2: movdqu 416(x3=%xmm9 +movdqu 416(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 544 ] +# asm 1: movdqu 544(x4=reg128#11 +# asm 2: movdqu 544(x4=%xmm10 +movdqu 544(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 672 ] +# asm 1: movdqu 672(x5=reg128#12 +# asm 2: movdqu 672(x5=%xmm11 +movdqu 672(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 800 ] +# asm 1: movdqu 800(x6=reg128#13 +# asm 2: movdqu 800(x6=%xmm12 +movdqu 800(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 928 ] +# asm 1: movdqu 928(x7=reg128#14 +# asm 2: movdqu 928(x7=%xmm13 +movdqu 928(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 
2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# 
asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: 
x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 32 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 48(x0=%xmm6 +movdqu 48(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 176 ] +# asm 1: movdqu 176(x1=reg128#8 +# asm 2: movdqu 176(x1=%xmm7 +movdqu 176(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 304 ] +# asm 1: movdqu 304(x2=reg128#9 +# asm 2: movdqu 304(x2=%xmm8 +movdqu 304(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 432 ] +# asm 1: movdqu 432(x3=reg128#10 +# asm 2: movdqu 432(x3=%xmm9 +movdqu 432(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 560 ] +# asm 1: movdqu 560(x4=reg128#11 +# asm 2: movdqu 560(x4=%xmm10 +movdqu 560(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 688 ] +# asm 1: movdqu 688(x5=reg128#12 +# asm 2: movdqu 688(x5=%xmm11 +movdqu 688(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 816 ] +# asm 1: movdqu 816(x6=reg128#13 +# asm 2: movdqu 816(x6=%xmm12 +movdqu 816(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 944 ] +# asm 1: movdqu 944(x7=reg128#14 +# asm 2: movdqu 944(x7=%xmm13 +movdqu 944(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 
= x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: 
vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 48 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 64(x0=%xmm6 +movdqu 64(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 192 ] +# asm 1: movdqu 192(x1=reg128#8 +# asm 2: movdqu 192(x1=%xmm7 +movdqu 192(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 320 ] +# asm 1: movdqu 320(x2=reg128#9 +# asm 2: movdqu 320(x2=%xmm8 +movdqu 320(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 448 ] +# asm 1: movdqu 448(x3=reg128#10 +# asm 2: movdqu 448(x3=%xmm9 +movdqu 448(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 576 ] +# asm 1: movdqu 576(x4=reg128#11 +# asm 2: movdqu 576(x4=%xmm10 +movdqu 576(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 704 ] +# asm 1: movdqu 704(x5=reg128#12 +# asm 2: movdqu 704(x5=%xmm11 +movdqu 704(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 832 ] +# asm 1: movdqu 832(x6=reg128#13 +# asm 2: movdqu 832(x6=%xmm12 +movdqu 832(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 960 ] +# asm 1: movdqu 960(x7=reg128#14 +# asm 2: movdqu 960(x7=%xmm13 +movdqu 960(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: 
vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: 
vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + 
+# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 64 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 80(x0=%xmm6 +movdqu 80(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 208 ] +# asm 1: movdqu 208(x1=reg128#8 +# asm 2: movdqu 208(x1=%xmm7 +movdqu 208(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 336 ] +# asm 1: movdqu 336(x2=reg128#9 +# asm 2: movdqu 336(x2=%xmm8 +movdqu 336(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 464 ] +# asm 1: movdqu 464(x3=reg128#10 +# asm 2: movdqu 464(x3=%xmm9 +movdqu 464(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 592 ] +# asm 1: movdqu 592(x4=reg128#11 +# asm 2: movdqu 592(x4=%xmm10 +movdqu 592(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 720 ] +# asm 1: movdqu 720(x5=reg128#12 +# asm 2: movdqu 720(x5=%xmm11 +movdqu 720(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 848 ] +# asm 1: movdqu 848(x6=reg128#13 +# asm 2: movdqu 848(x6=%xmm12 +movdqu 848(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 976 ] +# asm 1: movdqu 976(x7=reg128#14 +# asm 2: movdqu 976(x7=%xmm13 +movdqu 976(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 
& mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# 
asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 80 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 96(x0=%xmm6 +movdqu 96(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 224 ] +# asm 1: movdqu 224(x1=reg128#8 +# asm 2: movdqu 224(x1=%xmm7 +movdqu 224(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 352 ] +# asm 1: movdqu 352(x2=reg128#9 +# asm 2: movdqu 352(x2=%xmm8 +movdqu 352(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 480 ] +# asm 1: movdqu 480(x3=reg128#10 +# asm 2: movdqu 480(x3=%xmm9 +movdqu 480(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 608 ] +# asm 1: movdqu 608(x4=reg128#11 +# asm 2: movdqu 608(x4=%xmm10 +movdqu 608(%rdi),%xmm10 + +# qhasm: x5 = mem128[ 
input_0 + 736 ] +# asm 1: movdqu 736(x5=reg128#12 +# asm 2: movdqu 736(x5=%xmm11 +movdqu 736(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 864 ] +# asm 1: movdqu 864(x6=reg128#13 +# asm 2: movdqu 864(x6=%xmm12 +movdqu 864(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 992 ] +# asm 1: movdqu 992(x7=reg128#14 +# asm 2: movdqu 992(x7=%xmm13 +movdqu 992(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand 
v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor 
%xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 96 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 112(x0=%xmm6 +movdqu 112(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 240 ] +# asm 1: movdqu 240(x1=reg128#8 +# asm 2: movdqu 240(x1=%xmm7 +movdqu 240(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 368 ] +# asm 1: movdqu 368(x2=reg128#9 +# asm 2: movdqu 368(x2=%xmm8 +movdqu 368(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 496 ] +# asm 1: movdqu 496(x3=reg128#10 +# asm 2: movdqu 496(x3=%xmm9 +movdqu 496(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 624 ] +# asm 1: movdqu 624(x4=reg128#11 +# asm 2: movdqu 624(x4=%xmm10 +movdqu 624(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 752 ] +# asm 1: movdqu 752(x5=reg128#12 +# asm 2: movdqu 752(x5=%xmm11 +movdqu 752(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 880 ] +# asm 1: movdqu 880(x6=reg128#13 +# asm 2: movdqu 880(x6=%xmm12 +movdqu 880(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 1008 ] +# asm 1: movdqu 1008(x7=reg128#14 +# asm 2: movdqu 1008(x7=%xmm13 +movdqu 1008(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq 
$32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#1 +# asm 2: vpand v00=%xmm0 +vpand %xmm0,%xmm9,%xmm0 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#13 +# asm 2: vpsllq $32,v10=%xmm12 +vpsllq $32,%xmm13,%xmm12 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm1,%xmm13,%xmm1 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm12,%xmm0,%xmm0 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm9,%xmm1 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm2,%xmm14,%xmm9 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#13 +# asm 2: vpslld $16,v10=%xmm12 +vpslld $16,%xmm11,%xmm12 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#14 +# asm 2: vpsrld $16,v01=%xmm13 +vpsrld $16,%xmm14,%xmm13 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm12,%xmm9,%xmm9 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm13,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm10,%xmm12 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#14 +# asm 2: vpslld $16,v10=%xmm13 +vpslld $16,%xmm0,%xmm13 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm3,%xmm0,%xmm0 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#13 +# asm 2: vpor x1=%xmm12 +vpor %xmm13,%xmm12,%xmm12 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm10,%xmm0 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm2,%xmm6,%xmm10 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#14 +# asm 2: vpslld $16,v10=%xmm13 +vpslld $16,%xmm8,%xmm13 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 
x4 = v00 | v10 +# asm 1: vpor x4=reg128#11 +# asm 2: vpor x4=%xmm10 +vpor %xmm13,%xmm10,%xmm10 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#3 +# asm 2: vpand v00=%xmm2 +vpand %xmm2,%xmm7,%xmm2 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#9 +# asm 2: vpslld $16,v10=%xmm8 +vpslld $16,%xmm1,%xmm8 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm3,%xmm1,%xmm1 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#3 +# asm 2: vpor x5=%xmm2 +vpor %xmm8,%xmm2,%xmm2 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm7,%xmm1 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#4 +# asm 2: vpand v00=%xmm3 +vpand %xmm4,%xmm9,%xmm3 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#8 +# asm 2: vpsllw $8,v10=%xmm7 +vpsllw $8,%xmm12,%xmm7 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#9 +# asm 2: vpsrlw $8,v01=%xmm8 +vpsrlw $8,%xmm9,%xmm8 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm5,%xmm12,%xmm9 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#4 +# asm 2: vpor x0=%xmm3 +vpor %xmm7,%xmm3,%xmm3 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#8 +# asm 2: vpor x1=%xmm7 +vpor %xmm9,%xmm8,%xmm7 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm4,%xmm11,%xmm8 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#10 +# asm 2: vpsllw $8,v10=%xmm9 +vpsllw $8,%xmm0,%xmm9 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm5,%xmm0,%xmm0 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#9 +# asm 2: vpor x2=%xmm8 +vpor %xmm9,%xmm8,%xmm8 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm11,%xmm0 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm10,%xmm9 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#12 +# asm 2: vpsllw $8,v10=%xmm11 +vpsllw $8,%xmm2,%xmm11 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#11 +# asm 2: vpsrlw $8,v01=%xmm10 +vpsrlw $8,%xmm10,%xmm10 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#3 +# asm 2: vpand v11=%xmm2 +vpand %xmm5,%xmm2,%xmm2 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#10 +# asm 2: vpor x4=%xmm9 +vpor %xmm11,%xmm9,%xmm9 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#3 +# asm 2: vpor x5=%xmm2 +vpor %xmm2,%xmm10,%xmm2 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#5 +# asm 2: vpand v00=%xmm4 +vpand %xmm4,%xmm6,%xmm4 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#11 +# asm 2: vpsllw $8,v10=%xmm10 +vpsllw $8,%xmm1,%xmm10 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm5,%xmm1,%xmm1 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#5 +# asm 2: vpor x6=%xmm4 +vpor %xmm10,%xmm4,%xmm4 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm6,%xmm1 + +# qhasm: mem128[ 
input_0 + 112 ] = x0 +# asm 1: movdqu mask0=reg128#1 +# asm 2: movdqa PQCLEAN_MCELIECE6960119_AVX_MASK2_0(%rip),>mask0=%xmm0 +movdqa PQCLEAN_MCELIECE6960119_AVX_MASK2_0(%rip),%xmm0 + +# qhasm: mask1 aligned= mem128[ PQCLEAN_MCELIECE6960119_AVX_MASK2_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE6960119_AVX_MASK2_1(%rip),>mask1=reg128#2 +# asm 2: movdqa PQCLEAN_MCELIECE6960119_AVX_MASK2_1(%rip),>mask1=%xmm1 +movdqa PQCLEAN_MCELIECE6960119_AVX_MASK2_1(%rip),%xmm1 + +# qhasm: mask2 aligned= mem128[ PQCLEAN_MCELIECE6960119_AVX_MASK1_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE6960119_AVX_MASK1_0(%rip),>mask2=reg128#3 +# asm 2: movdqa PQCLEAN_MCELIECE6960119_AVX_MASK1_0(%rip),>mask2=%xmm2 +movdqa PQCLEAN_MCELIECE6960119_AVX_MASK1_0(%rip),%xmm2 + +# qhasm: mask3 aligned= mem128[ PQCLEAN_MCELIECE6960119_AVX_MASK1_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE6960119_AVX_MASK1_1(%rip),>mask3=reg128#4 +# asm 2: movdqa PQCLEAN_MCELIECE6960119_AVX_MASK1_1(%rip),>mask3=%xmm3 +movdqa PQCLEAN_MCELIECE6960119_AVX_MASK1_1(%rip),%xmm3 + +# qhasm: mask4 aligned= mem128[ PQCLEAN_MCELIECE6960119_AVX_MASK0_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE6960119_AVX_MASK0_0(%rip),>mask4=reg128#5 +# asm 2: movdqa PQCLEAN_MCELIECE6960119_AVX_MASK0_0(%rip),>mask4=%xmm4 +movdqa PQCLEAN_MCELIECE6960119_AVX_MASK0_0(%rip),%xmm4 + +# qhasm: mask5 aligned= mem128[ PQCLEAN_MCELIECE6960119_AVX_MASK0_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE6960119_AVX_MASK0_1(%rip),>mask5=reg128#6 +# asm 2: movdqa PQCLEAN_MCELIECE6960119_AVX_MASK0_1(%rip),>mask5=%xmm5 +movdqa PQCLEAN_MCELIECE6960119_AVX_MASK0_1(%rip),%xmm5 + +# qhasm: x0 = mem128[ input_0 + 0 ] +# asm 1: movdqu 0(x0=reg128#7 +# asm 2: movdqu 0(x0=%xmm6 +movdqu 0(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 16 ] +# asm 1: movdqu 16(x1=reg128#8 +# asm 2: movdqu 16(x1=%xmm7 +movdqu 16(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 32 ] +# asm 1: movdqu 32(x2=reg128#9 +# asm 2: movdqu 32(x2=%xmm8 +movdqu 32(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 48 ] +# asm 1: movdqu 48(x3=reg128#10 +# asm 2: movdqu 48(x3=%xmm9 +movdqu 48(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 64 ] +# asm 1: movdqu 64(x4=reg128#11 +# asm 2: movdqu 64(x4=%xmm10 +movdqu 64(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 80 ] +# asm 1: movdqu 80(x5=reg128#12 +# asm 2: movdqu 80(x5=%xmm11 +movdqu 80(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 96 ] +# asm 1: movdqu 96(x6=reg128#13 +# asm 2: movdqu 96(x6=%xmm12 +movdqu 96(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 112 ] +# asm 1: movdqu 112(x7=reg128#14 +# asm 2: movdqu 112(x7=%xmm13 +movdqu 112(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand 
%xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: 
vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 0 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 
128(x0=%xmm6 +movdqu 128(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 144 ] +# asm 1: movdqu 144(x1=reg128#8 +# asm 2: movdqu 144(x1=%xmm7 +movdqu 144(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 160 ] +# asm 1: movdqu 160(x2=reg128#9 +# asm 2: movdqu 160(x2=%xmm8 +movdqu 160(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 176 ] +# asm 1: movdqu 176(x3=reg128#10 +# asm 2: movdqu 176(x3=%xmm9 +movdqu 176(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 192 ] +# asm 1: movdqu 192(x4=reg128#11 +# asm 2: movdqu 192(x4=%xmm10 +movdqu 192(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 208 ] +# asm 1: movdqu 208(x5=reg128#12 +# asm 2: movdqu 208(x5=%xmm11 +movdqu 208(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 224 ] +# asm 1: movdqu 224(x6=reg128#13 +# asm 2: movdqu 224(x6=%xmm12 +movdqu 224(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 240 ] +# asm 1: movdqu 240(x7=reg128#14 +# asm 2: movdqu 240(x7=%xmm13 +movdqu 240(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor 
x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand 
v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 128 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 256(x0=%xmm6 +movdqu 256(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 272 ] +# asm 1: movdqu 272(x1=reg128#8 +# asm 2: movdqu 272(x1=%xmm7 +movdqu 272(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 288 ] +# asm 1: movdqu 288(x2=reg128#9 +# asm 2: movdqu 288(x2=%xmm8 +movdqu 288(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 304 ] +# asm 1: movdqu 304(x3=reg128#10 +# asm 2: movdqu 304(x3=%xmm9 +movdqu 304(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 320 ] +# asm 1: movdqu 320(x4=reg128#11 +# asm 2: movdqu 320(x4=%xmm10 +movdqu 320(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 336 ] +# asm 1: movdqu 336(x5=reg128#12 +# asm 2: movdqu 336(x5=%xmm11 +movdqu 336(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 352 ] +# asm 1: movdqu 352(x6=reg128#13 +# asm 2: movdqu 352(x6=%xmm12 +movdqu 352(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 368 ] +# asm 1: movdqu 368(x7=reg128#14 +# asm 2: movdqu 368(x7=%xmm13 +movdqu 368(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: 
[qhasm-generated constant-time transpose assembly from the imported PQClean Classic-McEliece-6960119 AVX code, copied verbatim. The 128-bit rounds load 16-byte blocks with movdqu and interleave row pairs with vpand/psllq/psrlq/vpor at strides 4, 2 and 1 on xmm registers; the 256-bit routine then loads the mask constants PQCLEAN_MCELIECE6960119_AVX_MASK5_0/_1, MASK4_0/_1 and MASK3_0/_1 with vmovapd, loads 32-byte rows with vmovupd, and performs the same interleave on ymm registers with vpsllq/vpsrlq (stride 32), vpslld/vpsrld (16) and vpsllw/vpsrlw (8). Each instruction is preceded by its qhasm statement and its two assembler renderings as `# qhasm:` / `# asm 1:` / `# asm 2:` comments.]
$32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: 
vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor 
%ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 64 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 96(x0=%ymm6 +vmovupd 96(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 352 ] +# asm 1: vmovupd 352(x1=reg256#8 +# asm 2: vmovupd 352(x1=%ymm7 +vmovupd 352(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 608 ] +# asm 1: vmovupd 608(x2=reg256#9 +# asm 2: vmovupd 608(x2=%ymm8 +vmovupd 608(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 864 ] +# asm 1: vmovupd 864(x3=reg256#10 +# asm 2: vmovupd 864(x3=%ymm9 +vmovupd 864(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1120 ] +# asm 1: vmovupd 1120(x4=reg256#11 +# asm 2: vmovupd 1120(x4=%ymm10 +vmovupd 1120(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1376 ] +# asm 1: vmovupd 1376(x5=reg256#12 +# asm 2: vmovupd 1376(x5=%ymm11 +vmovupd 1376(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1632 ] +# asm 1: vmovupd 1632(x6=reg256#13 +# asm 2: vmovupd 1632(x6=%ymm12 +vmovupd 1632(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1888 ] +# asm 1: vmovupd 1888(x7=reg256#14 +# asm 2: vmovupd 1888(x7=%ymm13 +vmovupd 1888(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq 
$32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 
+# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 96 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 128(x0=%ymm6 +vmovupd 128(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 384 ] +# asm 1: vmovupd 384(x1=reg256#8 +# asm 2: vmovupd 384(x1=%ymm7 +vmovupd 384(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 640 ] +# asm 1: vmovupd 640(x2=reg256#9 +# asm 2: vmovupd 640(x2=%ymm8 +vmovupd 640(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 896 ] +# asm 1: vmovupd 896(x3=reg256#10 +# asm 2: vmovupd 896(x3=%ymm9 +vmovupd 896(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1152 ] +# asm 1: vmovupd 1152(x4=reg256#11 +# asm 2: vmovupd 1152(x4=%ymm10 +vmovupd 1152(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1408 ] +# asm 1: vmovupd 1408(x5=reg256#12 +# asm 2: vmovupd 1408(x5=%ymm11 +vmovupd 1408(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1664 ] +# asm 1: vmovupd 1664(x6=reg256#13 +# asm 2: vmovupd 1664(x6=%ymm12 +vmovupd 1664(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1920 ] +# asm 1: vmovupd 1920(x7=reg256#14 +# asm 2: vmovupd 1920(x7=%ymm13 +vmovupd 1920(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 
+# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld 
$16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: 
vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 128 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 160(x0=%ymm6 +vmovupd 160(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 416 ] +# asm 1: vmovupd 416(x1=reg256#8 +# asm 2: vmovupd 416(x1=%ymm7 +vmovupd 416(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 672 ] +# asm 1: vmovupd 672(x2=reg256#9 +# asm 2: vmovupd 672(x2=%ymm8 +vmovupd 672(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 928 ] +# asm 1: vmovupd 928(x3=reg256#10 +# asm 2: vmovupd 928(x3=%ymm9 +vmovupd 928(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1184 ] +# asm 1: vmovupd 1184(x4=reg256#11 +# asm 2: vmovupd 1184(x4=%ymm10 +vmovupd 1184(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1440 ] +# asm 1: vmovupd 1440(x5=reg256#12 +# asm 2: vmovupd 1440(x5=%ymm11 +vmovupd 1440(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1696 ] +# asm 1: vmovupd 1696(x6=reg256#13 +# asm 2: vmovupd 1696(x6=%ymm12 +vmovupd 1696(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1952 ] +# asm 1: vmovupd 1952(x7=reg256#14 +# asm 2: vmovupd 1952(x7=%ymm13 +vmovupd 1952(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: 
vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 
+# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 160 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 192(x0=%ymm6 +vmovupd 192(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 448 ] +# asm 1: vmovupd 448(x1=reg256#8 +# asm 2: vmovupd 448(x1=%ymm7 +vmovupd 448(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 704 ] +# asm 1: vmovupd 704(x2=reg256#9 +# asm 2: vmovupd 704(x2=%ymm8 +vmovupd 704(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 960 ] +# asm 1: vmovupd 
960(x3=reg256#10 +# asm 2: vmovupd 960(x3=%ymm9 +vmovupd 960(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1216 ] +# asm 1: vmovupd 1216(x4=reg256#11 +# asm 2: vmovupd 1216(x4=%ymm10 +vmovupd 1216(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1472 ] +# asm 1: vmovupd 1472(x5=reg256#12 +# asm 2: vmovupd 1472(x5=%ymm11 +vmovupd 1472(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1728 ] +# asm 1: vmovupd 1728(x6=reg256#13 +# asm 2: vmovupd 1728(x6=%ymm12 +vmovupd 1728(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1984 ] +# asm 1: vmovupd 1984(x7=reg256#14 +# asm 2: vmovupd 1984(x7=%ymm13 +vmovupd 1984(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: 
vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: 
vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 192 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 224(x0=%ymm6 +vmovupd 224(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 480 ] +# asm 1: vmovupd 480(x1=reg256#8 +# asm 2: vmovupd 480(x1=%ymm7 +vmovupd 480(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 736 ] +# asm 1: vmovupd 736(x2=reg256#9 +# asm 2: vmovupd 736(x2=%ymm8 +vmovupd 736(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 992 ] +# asm 1: vmovupd 992(x3=reg256#10 +# asm 2: vmovupd 992(x3=%ymm9 +vmovupd 992(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1248 ] +# asm 1: vmovupd 1248(x4=reg256#11 +# asm 2: vmovupd 1248(x4=%ymm10 +vmovupd 1248(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1504 ] +# asm 1: vmovupd 1504(x5=reg256#12 +# asm 2: vmovupd 1504(x5=%ymm11 +vmovupd 1504(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1760 ] +# asm 1: vmovupd 1760(x6=reg256#13 +# asm 2: vmovupd 1760(x6=%ymm12 +vmovupd 1760(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 2016 ] +# asm 1: vmovupd 2016(x7=reg256#14 +# asm 2: vmovupd 2016(x7=%ymm13 +vmovupd 2016(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: 
[qhasm-generated AVX2 assembly, PQCLEAN_MCELIECE6960119_AVX bit-matrix transpose: each block of this hunk loads eight 256-bit rows (x0..x7) from input_0 (%rdi) at 32-byte strides, interleaves them with vpand/vpsllq/vpsrlq/vpor (and the vpslld/vpsrld/vpsllw/vpsrlw variants) against the PQCLEAN_MCELIECE6960119_AVX_MASK0_0 through MASK2_1 constants at shift distances 32, 16, 8, 4, 2 and 1, then writes the transposed rows back to the same offsets. Every instruction group follows the same v00/v10/v01/v11 pattern recorded in the "# qhasm:" / "# asm 1:" / "# asm 2:" annotations.]
4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#1 +# asm 2: vpand v10=%ymm0 +vpand %ymm13,%ymm0,%ymm0 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#2 +# asm 2: vpand v11=%ymm1 +vpand %ymm13,%ymm1,%ymm1 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#1 +# asm 2: vpor x3=%ymm0 +vpor %ymm12,%ymm0,%ymm0 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#2 +# asm 2: vpor x7=%ymm1 +vpor %ymm9,%ymm1,%ymm1 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm14,%ymm2,%ymm9 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#13 +# asm 2: vpand v10=%ymm12 +vpand %ymm11,%ymm2,%ymm12 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm14,%ymm3,%ymm13 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm12,%ymm9 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm13,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm10,%ymm2,%ymm12 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#14 +# asm 2: vpand v10=%ymm13 +vpand %ymm0,%ymm2,%ymm13 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#1 +# asm 2: vpand v11=%ymm0 +vpand %ymm0,%ymm3,%ymm0 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#13 +# asm 2: vpor x1=%ymm12 +vpor %ymm12,%ymm13,%ymm12 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#1 +# asm 2: vpor x3=%ymm0 +vpor %ymm10,%ymm0,%ymm0 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm6,%ymm2,%ymm10 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#14 +# asm 2: vpand v10=%ymm13 +vpand %ymm8,%ymm2,%ymm13 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 
unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#11 +# asm 2: vpor x4=%ymm10 +vpor %ymm10,%ymm13,%ymm10 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#3 +# asm 2: vpand v10=%ymm2 +vpand %ymm1,%ymm2,%ymm2 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#2 +# asm 2: vpand v11=%ymm1 +vpand %ymm1,%ymm3,%ymm1 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#3 +# asm 2: vpor x5=%ymm2 +vpor %ymm8,%ymm2,%ymm2 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#2 +# asm 2: vpor x7=%ymm1 +vpor %ymm7,%ymm1,%ymm1 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#4 +# asm 2: vpand v00=%ymm3 +vpand %ymm9,%ymm4,%ymm3 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#8 +# asm 2: vpand v10=%ymm7 +vpand %ymm12,%ymm4,%ymm7 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm9,%ymm5,%ymm8 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm12,%ymm5,%ymm9 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#4 +# asm 2: vpor x0=%ymm3 +vpor %ymm3,%ymm7,%ymm3 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#8 +# asm 2: vpor x1=%ymm7 +vpor %ymm8,%ymm9,%ymm7 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm11,%ymm4,%ymm8 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#10 +# asm 2: vpand v10=%ymm9 +vpand %ymm0,%ymm4,%ymm9 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#1 +# asm 2: vpand v11=%ymm0 +vpand %ymm0,%ymm5,%ymm0 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#9 +# asm 2: vpor x2=%ymm8 +vpor %ymm8,%ymm9,%ymm8 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#1 +# asm 2: vpor x3=%ymm0 +vpor %ymm11,%ymm0,%ymm0 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm10,%ymm4,%ymm9 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#12 +# asm 2: vpand v10=%ymm11 +vpand %ymm2,%ymm4,%ymm11 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#3 +# asm 2: vpand v11=%ymm2 +vpand %ymm2,%ymm5,%ymm2 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#10 +# asm 2: vpor x4=%ymm9 +vpor %ymm9,%ymm11,%ymm9 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#3 +# asm 2: vpor x5=%ymm2 +vpor %ymm10,%ymm2,%ymm2 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm6,%ymm4,%ymm10 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#5 +# asm 2: vpand v10=%ymm4 +vpand %ymm1,%ymm4,%ymm4 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#2 +# asm 2: vpand v11=%ymm1 +vpand %ymm1,%ymm5,%ymm1 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#5 +# asm 2: vpor x6=%ymm4 +vpor %ymm10,%ymm4,%ymm4 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#2 +# asm 2: vpor x7=%ymm1 +vpor %ymm6,%ymm1,%ymm1 + +# qhasm: mem256[ input_0 + 1792 ] = x0 +# asm 1: vmovupd +#include + 
+void PQCLEAN_MCELIECE6960119_AVX_uint32_sort(uint32_t *x, size_t n); + +#endif diff --git a/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/update_asm.S b/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/update_asm.S new file mode 100644 index 0000000000..9ec1b6e9d1 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/update_asm.S @@ -0,0 +1,576 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: int64 s0 + +# qhasm: int64 s1 + +# qhasm: int64 s2 + +# qhasm: enter update_asm +.p2align 5 +.global _PQCLEAN_MCELIECE6960119_AVX_update_asm +.global PQCLEAN_MCELIECE6960119_AVX_update_asm +_PQCLEAN_MCELIECE6960119_AVX_update_asm: +PQCLEAN_MCELIECE6960119_AVX_update_asm: +mov %rsp,%r11 +and $31,%r11 +add $0,%r11 +sub %r11,%rsp + +# qhasm: s2 = input_1 +# asm 1: mov s2=int64#2 +# asm 2: mov s2=%rsi +mov %rsi,%rsi + +# qhasm: s0 = mem64[ input_0 + 0 ] +# asm 1: movq 0(s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 
1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1, + +void PQCLEAN_MCELIECE6960119_AVX_store_i(unsigned char *out, uint64_t in, int i) { + int j; + + for (j = 0; j < i; j++) { + out[j] = (in >> (j * 8)) & 0xFF; + } +} + +void PQCLEAN_MCELIECE6960119_AVX_store2(unsigned char *dest, uint16_t a) { + dest[0] = a & 0xFF; + dest[1] = a >> 8; +} + +uint16_t PQCLEAN_MCELIECE6960119_AVX_load2(const unsigned char *src) { + uint16_t a; + + a = src[1]; + a <<= 8; + a |= src[0]; + + return a & GFMASK; +} + +uint32_t PQCLEAN_MCELIECE6960119_AVX_load4(const unsigned char *src) { + uint32_t a; + + a = src[3]; + a <<= 8; + a |= src[2]; + a <<= 8; + a |= src[1]; + a <<= 8; + a |= src[0]; + + return a; +} + +void PQCLEAN_MCELIECE6960119_AVX_irr_load(vec128 *out, const unsigned char *in) { + int i, j; + uint64_t v[2]; + uint16_t irr[ SYS_T + 1 ]; + + for (i = 0; i < SYS_T; i++) { + irr[i] = PQCLEAN_MCELIECE6960119_AVX_load2(in + i * 2); + irr[i] &= GFMASK; + } + + irr[ SYS_T ] = 1; + + for (i = 0; i < GFBITS; i++) { + v[0] = v[1] = 0; + + for (j = 63; j >= 0; j--) { + v[0] <<= 1; + v[0] |= (irr[j] >> i) & 1; + } + for (j = SYS_T; j >= 64; j--) { + v[1] <<= 1; + v[1] |= (irr[j] >> i) & 1; + } + + out[i] = PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(v[0], v[1]); + } +} + +void PQCLEAN_MCELIECE6960119_AVX_store8(unsigned char *out, uint64_t in) { + out[0] = (in >> 0x00) & 0xFF; + out[1] = (in >> 0x08) & 0xFF; + out[2] = (in >> 0x10) & 0xFF; + out[3] = (in >> 0x18) & 0xFF; + out[4] = (in >> 0x20) & 0xFF; + out[5] = (in >> 0x28) & 0xFF; + out[6] = (in >> 0x30) & 0xFF; + out[7] = (in >> 0x38) & 0xFF; +} + +uint64_t PQCLEAN_MCELIECE6960119_AVX_load8(const unsigned char *in) { + int i; + uint64_t ret = in[7]; + + for (i = 6; i >= 0; i--) { + ret <<= 8; + ret |= in[i]; + } + + return ret; +} + +vec128 PQCLEAN_MCELIECE6960119_AVX_load16(const unsigned char *in) { + return PQCLEAN_MCELIECE6960119_AVX_vec128_set2x( PQCLEAN_MCELIECE6960119_AVX_load8(in), PQCLEAN_MCELIECE6960119_AVX_load8(in + 8) ); +} + +void PQCLEAN_MCELIECE6960119_AVX_store16(unsigned char *out, vec128 in) { + PQCLEAN_MCELIECE6960119_AVX_store8(out + 0, PQCLEAN_MCELIECE6960119_AVX_vec128_extract(in, 0)); + PQCLEAN_MCELIECE6960119_AVX_store8(out + 8, PQCLEAN_MCELIECE6960119_AVX_vec128_extract(in, 1)); +} diff --git a/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/util.h b/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/util.h new file mode 100644 index 0000000000..37480980a5 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/util.h @@ -0,0 +1,23 @@ 
+#ifndef PQCLEAN_MCELIECE6960119_AVX_UTIL_H +#define PQCLEAN_MCELIECE6960119_AVX_UTIL_H +/* + This file is for loading/storing data in a little-endian fashion +*/ + + +#include "vec128.h" + +#include + +void PQCLEAN_MCELIECE6960119_AVX_store_i(unsigned char *out, uint64_t in, int i); +void PQCLEAN_MCELIECE6960119_AVX_store2(unsigned char *dest, uint16_t a); +uint16_t PQCLEAN_MCELIECE6960119_AVX_load2(const unsigned char *src); +uint32_t PQCLEAN_MCELIECE6960119_AVX_load4(const unsigned char *src); +void PQCLEAN_MCELIECE6960119_AVX_irr_load(vec128 *out, const unsigned char *in); +void PQCLEAN_MCELIECE6960119_AVX_store8(unsigned char *out, uint64_t in); +uint64_t PQCLEAN_MCELIECE6960119_AVX_load8(const unsigned char *in); +vec128 PQCLEAN_MCELIECE6960119_AVX_load16(const unsigned char *in); + +void PQCLEAN_MCELIECE6960119_AVX_store16(unsigned char *out, vec128 in); + +#endif diff --git a/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/vec128.c b/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/vec128.c new file mode 100644 index 0000000000..2aca35449b --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/vec128.c @@ -0,0 +1,79 @@ +#include "vec128.h" + +vec128 PQCLEAN_MCELIECE6960119_AVX_vec128_set1_16b(uint16_t a) { + return _mm_set1_epi16(a); +} + +vec128 PQCLEAN_MCELIECE6960119_AVX_vec128_setzero(void) { + return _mm_setzero_si128(); +} + +vec128 PQCLEAN_MCELIECE6960119_AVX_vec128_and(vec128 a, vec128 b) { + return _mm_and_si128(a, b); +} + +vec128 PQCLEAN_MCELIECE6960119_AVX_vec128_xor(vec128 a, vec128 b) { + return _mm_xor_si128(a, b); +} + +vec128 PQCLEAN_MCELIECE6960119_AVX_vec128_or(vec128 a, vec128 b) { + return _mm_or_si128(a, b); +} + +vec128 PQCLEAN_MCELIECE6960119_AVX_vec128_sll_2x(vec128 a, int s) { + return _mm_slli_epi64(a, s); +} + +vec128 PQCLEAN_MCELIECE6960119_AVX_vec128_srl_2x(vec128 a, int s) { + return _mm_srli_epi64(a, s); +} + +vec128 PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(uint64_t a0, uint64_t a1) { + return _mm_set_epi64x(a1, a0); +} + +vec128 PQCLEAN_MCELIECE6960119_AVX_vec128_unpack_low(vec128 a, vec128 b) { + return _mm_unpacklo_epi64(a, b); +} + +vec128 PQCLEAN_MCELIECE6960119_AVX_vec128_unpack_high(vec128 a, vec128 b) { + return _mm_unpackhi_epi64(a, b); +} + +vec128 PQCLEAN_MCELIECE6960119_AVX_vec128_setbits(uint64_t a) { + return _mm_set1_epi64x(-a); +} + +void PQCLEAN_MCELIECE6960119_AVX_vec128_copy(vec128 *dest, const vec128 *src) { + int i; + + for (i = 0; i < GFBITS; i++) { + dest[i] = src[i]; + } +} + +void PQCLEAN_MCELIECE6960119_AVX_vec128_add(vec128 *c, const vec128 *a, const vec128 *b) { + int i; + + for (i = 0; i < GFBITS; i++) { + c[i] = PQCLEAN_MCELIECE6960119_AVX_vec128_xor(a[i], b[i]); + } +} + +vec128 PQCLEAN_MCELIECE6960119_AVX_vec128_or_reduce(const vec128 *a) { + int i; + vec128 ret; + + ret = a[0]; + for (i = 1; i < GFBITS; i++) { + ret = PQCLEAN_MCELIECE6960119_AVX_vec128_or(ret, a[i]); + } + + return ret; +} + +/* bitsliced field multiplications */ +void PQCLEAN_MCELIECE6960119_AVX_vec128_mul(vec128 *h, vec128 *f, const vec128 *g) { + PQCLEAN_MCELIECE6960119_AVX_vec128_mul_asm(h, f, g, 16); +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/vec128.h b/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/vec128.h new file mode 100644 index 0000000000..479f20dac6 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/vec128.h @@ -0,0 +1,41 @@ +#ifndef PQCLEAN_MCELIECE6960119_AVX_VEC128_H +#define PQCLEAN_MCELIECE6960119_AVX_VEC128_H +/* + This file is for functions related 
to 128-bit vectors + including functions for bitsliced field operations +*/ + + +#include "params.h" + +#include +#include + +typedef __m128i vec128; + +// this needs to be a macro, because +// _mm_extract_epi64 requires a literal int argument. +#define PQCLEAN_MCELIECE6960119_AVX_vec128_extract(a, i) ((uint64_t) _mm_extract_epi64((vec128) (a), (i))) + +int PQCLEAN_MCELIECE6960119_AVX_vec128_testz(vec128 a); +vec128 PQCLEAN_MCELIECE6960119_AVX_vec128_set1_16b(uint16_t a); +vec128 PQCLEAN_MCELIECE6960119_AVX_vec128_setzero(void); +vec128 PQCLEAN_MCELIECE6960119_AVX_vec128_and(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE6960119_AVX_vec128_xor(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE6960119_AVX_vec128_or(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE6960119_AVX_vec128_sll_2x(vec128 a, int s); +vec128 PQCLEAN_MCELIECE6960119_AVX_vec128_srl_2x(vec128 a, int s); +vec128 PQCLEAN_MCELIECE6960119_AVX_vec128_set2x(uint64_t a0, uint64_t a1); +vec128 PQCLEAN_MCELIECE6960119_AVX_vec128_unpack_low(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE6960119_AVX_vec128_unpack_high(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE6960119_AVX_vec128_setbits(uint64_t a); +void PQCLEAN_MCELIECE6960119_AVX_vec128_copy(vec128 *dest, const vec128 *src); +void PQCLEAN_MCELIECE6960119_AVX_vec128_add(vec128 *c, const vec128 *a, const vec128 *b); +vec128 PQCLEAN_MCELIECE6960119_AVX_vec128_or_reduce(const vec128 *a); + +extern void PQCLEAN_MCELIECE6960119_AVX_vec128_mul_asm(vec128 *, vec128 *, const vec128 *, int); + +/* bitsliced field multiplications */ +void PQCLEAN_MCELIECE6960119_AVX_vec128_mul(vec128 *h, vec128 *f, const vec128 *g); + +#endif diff --git a/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/vec128_mul_asm.S b/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/vec128_mul_asm.S new file mode 100644 index 0000000000..a2f9e198f9 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/vec128_mul_asm.S @@ -0,0 +1,1816 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: reg256 b0 + +# qhasm: reg256 b1 + +# qhasm: reg256 b2 + +# qhasm: reg256 b3 + +# qhasm: reg256 b4 + +# qhasm: reg256 b5 + +# qhasm: reg256 b6 + +# qhasm: reg256 b7 + +# qhasm: reg256 b8 + +# qhasm: reg256 b9 + +# qhasm: reg256 b10 + +# qhasm: reg256 b11 + +# qhasm: reg256 b12 + +# qhasm: reg256 a0 + +# qhasm: reg256 a1 + +# qhasm: reg256 a2 + +# qhasm: reg256 a3 + +# qhasm: reg256 a4 + +# qhasm: reg256 a5 + +# qhasm: reg256 a6 + +# qhasm: reg256 r0 + +# qhasm: reg256 r1 + +# qhasm: reg256 r2 + +# qhasm: reg256 r3 + +# qhasm: reg256 r4 + +# qhasm: reg256 r5 + +# qhasm: reg256 r6 + +# qhasm: reg256 r7 + +# qhasm: reg256 r8 + +# qhasm: reg256 r9 + +# qhasm: reg256 r10 + +# qhasm: reg256 r11 + +# qhasm: reg256 r12 + +# qhasm: reg256 r13 + +# qhasm: reg256 r14 + +# qhasm: reg256 r15 + +# qhasm: reg256 r16 + +# qhasm: reg256 r17 + +# qhasm: reg256 r18 + +# qhasm: reg256 r19 + +# qhasm: reg256 r20 + +# qhasm: reg256 r21 + +# qhasm: reg256 r22 + +# qhasm: reg256 r23 + +# qhasm: reg256 r24 + +# qhasm: reg256 r + +# qhasm: reg128 h0 + +# qhasm: reg128 h1 + +# qhasm: reg128 h2 + +# qhasm: reg128 h3 + +# qhasm: reg128 h4 + +# qhasm: reg128 h5 + +# qhasm: reg128 h6 
+ +# qhasm: reg128 h7 + +# qhasm: reg128 h8 + +# qhasm: reg128 h9 + +# qhasm: reg128 h10 + +# qhasm: reg128 h11 + +# qhasm: reg128 h12 + +# qhasm: reg128 h13 + +# qhasm: reg128 h14 + +# qhasm: reg128 h15 + +# qhasm: reg128 h16 + +# qhasm: reg128 h17 + +# qhasm: reg128 h18 + +# qhasm: reg128 h19 + +# qhasm: reg128 h20 + +# qhasm: reg128 h21 + +# qhasm: reg128 h22 + +# qhasm: reg128 h23 + +# qhasm: reg128 h24 + +# qhasm: stack4864 buf + +# qhasm: int64 ptr + +# qhasm: int64 tmp + +# qhasm: enter vec128_mul_asm +.p2align 5 +.global _PQCLEAN_MCELIECE6960119_AVX_vec128_mul_asm +.global PQCLEAN_MCELIECE6960119_AVX_vec128_mul_asm +_PQCLEAN_MCELIECE6960119_AVX_vec128_mul_asm: +PQCLEAN_MCELIECE6960119_AVX_vec128_mul_asm: +mov %rsp,%r11 +and $31,%r11 +add $608,%r11 +sub %r11,%rsp + +# qhasm: ptr = &buf +# asm 1: leaq ptr=int64#5 +# asm 2: leaq ptr=%r8 +leaq 0(%rsp),%r8 + +# qhasm: tmp = input_3 +# asm 1: mov tmp=int64#6 +# asm 2: mov tmp=%r9 +mov %rcx,%r9 + +# qhasm: tmp *= 12 +# asm 1: imulq $12,tmp=int64#6 +# asm 2: imulq $12,tmp=%r9 +imulq $12,%r9,%r9 + +# qhasm: input_2 += tmp +# asm 1: add b12=reg256#1 +# asm 2: vbroadcasti128 0(b12=%ymm0 +vbroadcasti128 0(%rdx), %ymm0 + +# qhasm: input_2 -= input_3 +# asm 1: sub a6=reg256#2 +# asm 2: vpxor a6=%ymm1 +vpxor %ymm1,%ymm1,%ymm1 + +# qhasm: a6[0] = mem128[ input_1 + 96 ] +# asm 1: vinsertf128 $0x0,96(r18=reg256#3 +# asm 2: vpand r18=%ymm2 +vpand %ymm0,%ymm1,%ymm2 + +# qhasm: mem256[ ptr + 576 ] = r18 +# asm 1: vmovupd r17=reg256#4 +# asm 2: vpand r17=%ymm3 +vpand %ymm0,%ymm2,%ymm3 + +# qhasm: a4[0] = mem128[ input_1 + 64 ] +# asm 1: vinsertf128 $0x0,64(r16=reg256#6 +# asm 2: vpand r16=%ymm5 +vpand %ymm0,%ymm4,%ymm5 + +# qhasm: a3[0] = mem128[ input_1 + 48 ] +# asm 1: vinsertf128 $0x0,48(r15=reg256#8 +# asm 2: vpand r15=%ymm7 +vpand %ymm0,%ymm6,%ymm7 + +# qhasm: a2[0] = mem128[ input_1 + 32 ] +# asm 1: vinsertf128 $0x0,32(r14=reg256#10 +# asm 2: vpand r14=%ymm9 +vpand %ymm0,%ymm8,%ymm9 + +# qhasm: a1[0] = mem128[ input_1 + 16 ] +# asm 1: vinsertf128 $0x0,16(r13=reg256#12 +# asm 2: vpand r13=%ymm11 +vpand %ymm0,%ymm10,%ymm11 + +# qhasm: a0[0] = mem128[ input_1 + 0 ] +# asm 1: vinsertf128 $0x0,0(r12=reg256#1 +# asm 2: vpand r12=%ymm0 +vpand %ymm0,%ymm12,%ymm0 + +# qhasm: b11 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b11=reg256#14 +# asm 2: vbroadcasti128 0(b11=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm2,%ymm3 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm4,%ymm3 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm6,%ymm3 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm8,%ymm3 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm10,%ymm3 + +# qhasm: r12 ^= r +# asm 1: vpxor r11=reg256#4 +# asm 2: vpand r11=%ymm3 +vpand %ymm13,%ymm12,%ymm3 + +# qhasm: b10 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b10=reg256#14 +# asm 2: vbroadcasti128 0(b10=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm2,%ymm5 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm4,%ymm5 + 
+# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm6,%ymm5 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm8,%ymm5 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm10,%ymm5 + +# qhasm: r11 ^= r +# asm 1: vpxor r10=reg256#6 +# asm 2: vpand r10=%ymm5 +vpand %ymm13,%ymm12,%ymm5 + +# qhasm: b9 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b9=reg256#14 +# asm 2: vbroadcasti128 0(b9=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm2,%ymm7 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm4,%ymm7 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm6,%ymm7 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm8,%ymm7 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm10,%ymm7 + +# qhasm: r10 ^= r +# asm 1: vpxor r9=reg256#8 +# asm 2: vpand r9=%ymm7 +vpand %ymm13,%ymm12,%ymm7 + +# qhasm: b8 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b8=reg256#14 +# asm 2: vbroadcasti128 0(b8=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm2,%ymm9 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm6,%ymm9 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm8,%ymm9 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm10,%ymm9 + +# qhasm: r9 ^= r +# asm 1: vpxor r8=reg256#10 +# asm 2: vpand r8=%ymm9 +vpand %ymm13,%ymm12,%ymm9 + +# qhasm: b7 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b7=reg256#14 +# asm 2: vbroadcasti128 0(b7=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm2,%ymm11 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm4,%ymm11 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm6,%ymm11 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm8,%ymm11 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm10,%ymm11 + +# qhasm: r8 ^= r +# asm 1: vpxor r7=reg256#12 +# asm 2: vpand r7=%ymm11 +vpand %ymm13,%ymm12,%ymm11 + +# qhasm: b6 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b6=reg256#14 +# asm 2: vbroadcasti128 0(b6=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm2,%ymm0 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm4,%ymm0 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm6,%ymm0 + +# qhasm: r9 ^= r +# asm 
1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm8,%ymm0 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm10,%ymm0 + +# qhasm: r7 ^= r +# asm 1: vpxor r6=reg256#1 +# asm 2: vpand r6=%ymm0 +vpand %ymm13,%ymm12,%ymm0 + +# qhasm: b5 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b5=reg256#14 +# asm 2: vbroadcasti128 0(b5=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm2,%ymm3 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm4,%ymm3 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm6,%ymm3 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm8,%ymm3 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm10,%ymm3 + +# qhasm: r6 ^= r +# asm 1: vpxor r5=reg256#4 +# asm 2: vpand r5=%ymm3 +vpand %ymm13,%ymm12,%ymm3 + +# qhasm: b4 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b4=reg256#14 +# asm 2: vbroadcasti128 0(b4=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm2,%ymm5 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm4,%ymm5 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm6,%ymm5 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm8,%ymm5 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm10,%ymm5 + +# qhasm: r5 ^= r +# asm 1: vpxor r4=reg256#6 +# asm 2: vpand r4=%ymm5 +vpand %ymm13,%ymm12,%ymm5 + +# qhasm: b3 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b3=reg256#14 +# asm 2: vbroadcasti128 0(b3=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm2,%ymm7 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm4,%ymm7 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm6,%ymm7 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm8,%ymm7 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm10,%ymm7 + +# qhasm: r4 ^= r +# asm 1: vpxor r3=reg256#8 +# asm 2: vpand r3=%ymm7 +vpand %ymm13,%ymm12,%ymm7 + +# qhasm: b2 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b2=reg256#14 +# asm 2: vbroadcasti128 0(b2=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm2,%ymm9 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm6,%ymm9 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm8,%ymm9 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand 
%ymm13,%ymm10,%ymm9 + +# qhasm: r3 ^= r +# asm 1: vpxor r2=reg256#10 +# asm 2: vpand r2=%ymm9 +vpand %ymm13,%ymm12,%ymm9 + +# qhasm: b1 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b1=reg256#14 +# asm 2: vbroadcasti128 0(b1=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm2,%ymm11 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm4,%ymm11 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm6,%ymm11 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm8,%ymm11 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm10,%ymm11 + +# qhasm: r2 ^= r +# asm 1: vpxor r1=reg256#12 +# asm 2: vpand r1=%ymm11 +vpand %ymm13,%ymm12,%ymm11 + +# qhasm: b0 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b0=reg256#14 +# asm 2: vbroadcasti128 0(b0=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#2 +# asm 2: vpand r=%ymm1 +vpand %ymm13,%ymm1,%ymm1 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm2,%ymm0 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm4,%ymm0 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm6,%ymm0 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm8,%ymm0 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm10,%ymm0 + +# qhasm: r1 ^= r +# asm 1: vpxor r0=reg256#1 +# asm 2: vpand r0=%ymm0 +vpand %ymm13,%ymm12,%ymm0 + +# qhasm: mem256[ ptr + 160 ] = r5 +# asm 1: vmovupd h24=reg128#1 +# asm 2: movdqu 560(h24=%xmm0 +movdqu 560(%r8),%xmm0 + +# qhasm: h11 = h24 +# asm 1: movdqa h11=reg128#2 +# asm 2: movdqa h11=%xmm1 +movdqa %xmm0,%xmm1 + +# qhasm: h12 = h24 +# asm 1: movdqa h12=reg128#3 +# asm 2: movdqa h12=%xmm2 +movdqa %xmm0,%xmm2 + +# qhasm: h14 = h24 +# asm 1: movdqa h14=reg128#4 +# asm 2: movdqa h14=%xmm3 +movdqa %xmm0,%xmm3 + +# qhasm: h15 = h24 +# asm 1: movdqa h15=reg128#1 +# asm 2: movdqa h15=%xmm0 +movdqa %xmm0,%xmm0 + +# qhasm: h23 = mem128[ ptr + 528 ] +# asm 1: movdqu 528(h23=reg128#5 +# asm 2: movdqu 528(h23=%xmm4 +movdqu 528(%r8),%xmm4 + +# qhasm: h10 = h23 +# asm 1: movdqa h10=reg128#6 +# asm 2: movdqa h10=%xmm5 +movdqa %xmm4,%xmm5 + +# qhasm: h11 = h11 ^ h23 +# asm 1: vpxor h11=reg128#2 +# asm 2: vpxor h11=%xmm1 +vpxor %xmm4,%xmm1,%xmm1 + +# qhasm: h13 = h23 +# asm 1: movdqa h13=reg128#7 +# asm 2: movdqa h13=%xmm6 +movdqa %xmm4,%xmm6 + +# qhasm: h14 = h14 ^ h23 +# asm 1: vpxor h14=reg128#4 +# asm 2: vpxor h14=%xmm3 +vpxor %xmm4,%xmm3,%xmm3 + +# qhasm: h22 = mem128[ ptr + 496 ] +# asm 1: movdqu 496(h22=reg128#5 +# asm 2: movdqu 496(h22=%xmm4 +movdqu 496(%r8),%xmm4 + +# qhasm: h9 = h22 +# asm 1: movdqa h9=reg128#8 +# asm 2: movdqa h9=%xmm7 +movdqa %xmm4,%xmm7 + +# qhasm: h10 = h10 ^ h22 +# asm 1: vpxor h10=reg128#6 +# asm 2: vpxor h10=%xmm5 +vpxor %xmm4,%xmm5,%xmm5 + +# qhasm: h12 = h12 ^ h22 +# asm 1: vpxor h12=reg128#3 +# asm 2: vpxor h12=%xmm2 +vpxor %xmm4,%xmm2,%xmm2 + +# qhasm: h13 = h13 ^ h22 +# asm 1: vpxor h13=reg128#5 +# asm 2: vpxor h13=%xmm4 +vpxor %xmm4,%xmm6,%xmm4 + +# qhasm: h21 = mem128[ ptr + 464 ] +# asm 1: movdqu 464(h21=reg128#7 +# asm 2: movdqu 464(h21=%xmm6 +movdqu 
464(%r8),%xmm6 + +# qhasm: h8 = h21 +# asm 1: movdqa h8=reg128#9 +# asm 2: movdqa h8=%xmm8 +movdqa %xmm6,%xmm8 + +# qhasm: h9 = h9 ^ h21 +# asm 1: vpxor h9=reg128#8 +# asm 2: vpxor h9=%xmm7 +vpxor %xmm6,%xmm7,%xmm7 + +# qhasm: h11 = h11 ^ h21 +# asm 1: vpxor h11=reg128#2 +# asm 2: vpxor h11=%xmm1 +vpxor %xmm6,%xmm1,%xmm1 + +# qhasm: h12 = h12 ^ h21 +# asm 1: vpxor h12=reg128#3 +# asm 2: vpxor h12=%xmm2 +vpxor %xmm6,%xmm2,%xmm2 + +# qhasm: h20 = mem128[ ptr + 432 ] +# asm 1: movdqu 432(h20=reg128#7 +# asm 2: movdqu 432(h20=%xmm6 +movdqu 432(%r8),%xmm6 + +# qhasm: h7 = h20 +# asm 1: movdqa h7=reg128#10 +# asm 2: movdqa h7=%xmm9 +movdqa %xmm6,%xmm9 + +# qhasm: h8 = h8 ^ h20 +# asm 1: vpxor h8=reg128#9 +# asm 2: vpxor h8=%xmm8 +vpxor %xmm6,%xmm8,%xmm8 + +# qhasm: h10 = h10 ^ h20 +# asm 1: vpxor h10=reg128#6 +# asm 2: vpxor h10=%xmm5 +vpxor %xmm6,%xmm5,%xmm5 + +# qhasm: h11 = h11 ^ h20 +# asm 1: vpxor h11=reg128#2 +# asm 2: vpxor h11=%xmm1 +vpxor %xmm6,%xmm1,%xmm1 + +# qhasm: h19 = mem128[ ptr + 400 ] +# asm 1: movdqu 400(h19=reg128#7 +# asm 2: movdqu 400(h19=%xmm6 +movdqu 400(%r8),%xmm6 + +# qhasm: h6 = h19 +# asm 1: movdqa h6=reg128#11 +# asm 2: movdqa h6=%xmm10 +movdqa %xmm6,%xmm10 + +# qhasm: h7 = h7 ^ h19 +# asm 1: vpxor h7=reg128#10 +# asm 2: vpxor h7=%xmm9 +vpxor %xmm6,%xmm9,%xmm9 + +# qhasm: h9 = h9 ^ h19 +# asm 1: vpxor h9=reg128#8 +# asm 2: vpxor h9=%xmm7 +vpxor %xmm6,%xmm7,%xmm7 + +# qhasm: h10 = h10 ^ h19 +# asm 1: vpxor h10=reg128#6 +# asm 2: vpxor h10=%xmm5 +vpxor %xmm6,%xmm5,%xmm5 + +# qhasm: h18 = mem128[ ptr + 368 ] +# asm 1: movdqu 368(h18=reg128#7 +# asm 2: movdqu 368(h18=%xmm6 +movdqu 368(%r8),%xmm6 + +# qhasm: h18 = h18 ^ mem128[ ptr + 576 ] +# asm 1: vpxor 576(h18=reg128#7 +# asm 2: vpxor 576(h18=%xmm6 +vpxor 576(%r8),%xmm6,%xmm6 + +# qhasm: h5 = h18 +# asm 1: movdqa h5=reg128#12 +# asm 2: movdqa h5=%xmm11 +movdqa %xmm6,%xmm11 + +# qhasm: h6 = h6 ^ h18 +# asm 1: vpxor h6=reg128#11 +# asm 2: vpxor h6=%xmm10 +vpxor %xmm6,%xmm10,%xmm10 + +# qhasm: h8 = h8 ^ h18 +# asm 1: vpxor h8=reg128#9 +# asm 2: vpxor h8=%xmm8 +vpxor %xmm6,%xmm8,%xmm8 + +# qhasm: h9 = h9 ^ h18 +# asm 1: vpxor h9=reg128#7 +# asm 2: vpxor h9=%xmm6 +vpxor %xmm6,%xmm7,%xmm6 + +# qhasm: h17 = mem128[ ptr + 336 ] +# asm 1: movdqu 336(h17=reg128#8 +# asm 2: movdqu 336(h17=%xmm7 +movdqu 336(%r8),%xmm7 + +# qhasm: h17 = h17 ^ mem128[ ptr + 544 ] +# asm 1: vpxor 544(h17=reg128#8 +# asm 2: vpxor 544(h17=%xmm7 +vpxor 544(%r8),%xmm7,%xmm7 + +# qhasm: h4 = h17 +# asm 1: movdqa h4=reg128#13 +# asm 2: movdqa h4=%xmm12 +movdqa %xmm7,%xmm12 + +# qhasm: h5 = h5 ^ h17 +# asm 1: vpxor h5=reg128#12 +# asm 2: vpxor h5=%xmm11 +vpxor %xmm7,%xmm11,%xmm11 + +# qhasm: h7 = h7 ^ h17 +# asm 1: vpxor h7=reg128#10 +# asm 2: vpxor h7=%xmm9 +vpxor %xmm7,%xmm9,%xmm9 + +# qhasm: h8 = h8 ^ h17 +# asm 1: vpxor h8=reg128#8 +# asm 2: vpxor h8=%xmm7 +vpxor %xmm7,%xmm8,%xmm7 + +# qhasm: h16 = mem128[ ptr + 304 ] +# asm 1: movdqu 304(h16=reg128#9 +# asm 2: movdqu 304(h16=%xmm8 +movdqu 304(%r8),%xmm8 + +# qhasm: h16 = h16 ^ mem128[ ptr + 512 ] +# asm 1: vpxor 512(h16=reg128#9 +# asm 2: vpxor 512(h16=%xmm8 +vpxor 512(%r8),%xmm8,%xmm8 + +# qhasm: h3 = h16 +# asm 1: movdqa h3=reg128#14 +# asm 2: movdqa h3=%xmm13 +movdqa %xmm8,%xmm13 + +# qhasm: h4 = h4 ^ h16 +# asm 1: vpxor h4=reg128#13 +# asm 2: vpxor h4=%xmm12 +vpxor %xmm8,%xmm12,%xmm12 + +# qhasm: h6 = h6 ^ h16 +# asm 1: vpxor h6=reg128#11 +# asm 2: vpxor h6=%xmm10 +vpxor %xmm8,%xmm10,%xmm10 + +# qhasm: h7 = h7 ^ h16 +# asm 1: vpxor h7=reg128#9 +# asm 2: vpxor h7=%xmm8 +vpxor %xmm8,%xmm9,%xmm8 + +# 
qhasm: h15 = h15 ^ mem128[ ptr + 272 ] +# asm 1: vpxor 272(h15=reg128#1 +# asm 2: vpxor 272(h15=%xmm0 +vpxor 272(%r8),%xmm0,%xmm0 + +# qhasm: h15 = h15 ^ mem128[ ptr + 480 ] +# asm 1: vpxor 480(h15=reg128#1 +# asm 2: vpxor 480(h15=%xmm0 +vpxor 480(%r8),%xmm0,%xmm0 + +# qhasm: h2 = h15 +# asm 1: movdqa h2=reg128#10 +# asm 2: movdqa h2=%xmm9 +movdqa %xmm0,%xmm9 + +# qhasm: h3 = h3 ^ h15 +# asm 1: vpxor h3=reg128#14 +# asm 2: vpxor h3=%xmm13 +vpxor %xmm0,%xmm13,%xmm13 + +# qhasm: h5 = h5 ^ h15 +# asm 1: vpxor h5=reg128#12 +# asm 2: vpxor h5=%xmm11 +vpxor %xmm0,%xmm11,%xmm11 + +# qhasm: h6 = h6 ^ h15 +# asm 1: vpxor h6=reg128#1 +# asm 2: vpxor h6=%xmm0 +vpxor %xmm0,%xmm10,%xmm0 + +# qhasm: h14 = h14 ^ mem128[ ptr + 240 ] +# asm 1: vpxor 240(h14=reg128#4 +# asm 2: vpxor 240(h14=%xmm3 +vpxor 240(%r8),%xmm3,%xmm3 + +# qhasm: h14 = h14 ^ mem128[ ptr + 448 ] +# asm 1: vpxor 448(h14=reg128#4 +# asm 2: vpxor 448(h14=%xmm3 +vpxor 448(%r8),%xmm3,%xmm3 + +# qhasm: h1 = h14 +# asm 1: movdqa h1=reg128#11 +# asm 2: movdqa h1=%xmm10 +movdqa %xmm3,%xmm10 + +# qhasm: h2 = h2 ^ h14 +# asm 1: vpxor h2=reg128#10 +# asm 2: vpxor h2=%xmm9 +vpxor %xmm3,%xmm9,%xmm9 + +# qhasm: h4 = h4 ^ h14 +# asm 1: vpxor h4=reg128#13 +# asm 2: vpxor h4=%xmm12 +vpxor %xmm3,%xmm12,%xmm12 + +# qhasm: h5 = h5 ^ h14 +# asm 1: vpxor h5=reg128#4 +# asm 2: vpxor h5=%xmm3 +vpxor %xmm3,%xmm11,%xmm3 + +# qhasm: h13 = h13 ^ mem128[ ptr + 208 ] +# asm 1: vpxor 208(h13=reg128#5 +# asm 2: vpxor 208(h13=%xmm4 +vpxor 208(%r8),%xmm4,%xmm4 + +# qhasm: h13 = h13 ^ mem128[ ptr + 416 ] +# asm 1: vpxor 416(h13=reg128#5 +# asm 2: vpxor 416(h13=%xmm4 +vpxor 416(%r8),%xmm4,%xmm4 + +# qhasm: h0 = h13 +# asm 1: movdqa h0=reg128#12 +# asm 2: movdqa h0=%xmm11 +movdqa %xmm4,%xmm11 + +# qhasm: h1 = h1 ^ h13 +# asm 1: vpxor h1=reg128#11 +# asm 2: vpxor h1=%xmm10 +vpxor %xmm4,%xmm10,%xmm10 + +# qhasm: h3 = h3 ^ h13 +# asm 1: vpxor h3=reg128#14 +# asm 2: vpxor h3=%xmm13 +vpxor %xmm4,%xmm13,%xmm13 + +# qhasm: h4 = h4 ^ h13 +# asm 1: vpxor h4=reg128#5 +# asm 2: vpxor h4=%xmm4 +vpxor %xmm4,%xmm12,%xmm4 + +# qhasm: h12 = h12 ^ mem128[ ptr + 384 ] +# asm 1: vpxor 384(h12=reg128#3 +# asm 2: vpxor 384(h12=%xmm2 +vpxor 384(%r8),%xmm2,%xmm2 + +# qhasm: h12 = h12 ^ mem128[ ptr + 176 ] +# asm 1: vpxor 176(h12=reg128#3 +# asm 2: vpxor 176(h12=%xmm2 +vpxor 176(%r8),%xmm2,%xmm2 + +# qhasm: mem128[ input_0 + 192 ] = h12 +# asm 1: movdqu h11=reg128#2 +# asm 2: vpxor 352(h11=%xmm1 +vpxor 352(%r8),%xmm1,%xmm1 + +# qhasm: h11 = h11 ^ mem128[ ptr + 144 ] +# asm 1: vpxor 144(h11=reg128#2 +# asm 2: vpxor 144(h11=%xmm1 +vpxor 144(%r8),%xmm1,%xmm1 + +# qhasm: mem128[ input_0 + 176 ] = h11 +# asm 1: movdqu h10=reg128#2 +# asm 2: vpxor 320(h10=%xmm1 +vpxor 320(%r8),%xmm5,%xmm1 + +# qhasm: h10 = h10 ^ mem128[ ptr + 112 ] +# asm 1: vpxor 112(h10=reg128#2 +# asm 2: vpxor 112(h10=%xmm1 +vpxor 112(%r8),%xmm1,%xmm1 + +# qhasm: mem128[ input_0 + 160 ] = h10 +# asm 1: movdqu h9=reg128#2 +# asm 2: vpxor 288(h9=%xmm1 +vpxor 288(%r8),%xmm6,%xmm1 + +# qhasm: h9 = h9 ^ mem128[ ptr + 80 ] +# asm 1: vpxor 80(h9=reg128#2 +# asm 2: vpxor 80(h9=%xmm1 +vpxor 80(%r8),%xmm1,%xmm1 + +# qhasm: mem128[ input_0 + 144 ] = h9 +# asm 1: movdqu h8=reg128#2 +# asm 2: vpxor 256(h8=%xmm1 +vpxor 256(%r8),%xmm7,%xmm1 + +# qhasm: h8 = h8 ^ mem128[ ptr + 48 ] +# asm 1: vpxor 48(h8=reg128#2 +# asm 2: vpxor 48(h8=%xmm1 +vpxor 48(%r8),%xmm1,%xmm1 + +# qhasm: mem128[ input_0 + 128 ] = h8 +# asm 1: movdqu h7=reg128#2 +# asm 2: vpxor 224(h7=%xmm1 +vpxor 224(%r8),%xmm8,%xmm1 + +# qhasm: h7 = h7 ^ mem128[ ptr + 16 ] +# asm 1: vpxor 
16(h7=reg128#2 +# asm 2: vpxor 16(h7=%xmm1 +vpxor 16(%r8),%xmm1,%xmm1 + +# qhasm: mem128[ input_0 + 112 ] = h7 +# asm 1: movdqu h6=reg128#1 +# asm 2: vpxor 192(h6=%xmm0 +vpxor 192(%r8),%xmm0,%xmm0 + +# qhasm: mem128[ input_0 + 96 ] = h6 +# asm 1: movdqu h5=reg128#1 +# asm 2: vpxor 160(h5=%xmm0 +vpxor 160(%r8),%xmm3,%xmm0 + +# qhasm: mem128[ input_0 + 80 ] = h5 +# asm 1: movdqu h4=reg128#1 +# asm 2: vpxor 128(h4=%xmm0 +vpxor 128(%r8),%xmm4,%xmm0 + +# qhasm: mem128[ input_0 + 64 ] = h4 +# asm 1: movdqu h3=reg128#1 +# asm 2: vpxor 96(h3=%xmm0 +vpxor 96(%r8),%xmm13,%xmm0 + +# qhasm: mem128[ input_0 + 48 ] = h3 +# asm 1: movdqu h2=reg128#1 +# asm 2: vpxor 64(h2=%xmm0 +vpxor 64(%r8),%xmm9,%xmm0 + +# qhasm: mem128[ input_0 + 32 ] = h2 +# asm 1: movdqu h1=reg128#1 +# asm 2: vpxor 32(h1=%xmm0 +vpxor 32(%r8),%xmm10,%xmm0 + +# qhasm: mem128[ input_0 + 16 ] = h1 +# asm 1: movdqu h0=reg128#1 +# asm 2: vpxor 0(h0=%xmm0 +vpxor 0(%r8),%xmm11,%xmm0 + +# qhasm: mem128[ input_0 + 0 ] = h0 +# asm 1: movdqu + +typedef __m256i vec256; + +vec256 PQCLEAN_MCELIECE6960119_AVX_vec256_set1_16b(uint16_t a); +vec256 PQCLEAN_MCELIECE6960119_AVX_vec256_setzero(void); +vec256 PQCLEAN_MCELIECE6960119_AVX_vec256_set4x(uint64_t a0, uint64_t a1, uint64_t a2, uint64_t a3); + +// Extract requires a literal argument so need to be macros +#define PQCLEAN_MCELIECE6960119_AVX_vec256_extract2x(a,i) ((vec128) _mm256_extractf128_si256((vec256) (a),(i))) +#define PQCLEAN_MCELIECE6960119_AVX_vec256_extract(a,i) ((uint64_t) _mm256_extract_epi64((vec256) (a),(i))) + +int PQCLEAN_MCELIECE6960119_AVX_vec256_testz(vec256 a); +vec256 PQCLEAN_MCELIECE6960119_AVX_vec256_and(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE6960119_AVX_vec256_xor(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE6960119_AVX_vec256_or(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE6960119_AVX_vec256_sll_4x(vec256 a, int s); +vec256 PQCLEAN_MCELIECE6960119_AVX_vec256_srl_4x(vec256 a, int s); +vec256 PQCLEAN_MCELIECE6960119_AVX_vec256_unpack_low(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE6960119_AVX_vec256_unpack_high(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE6960119_AVX_vec256_unpack_low_2x(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE6960119_AVX_vec256_unpack_high_2x(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE6960119_AVX_vec256_or_reduce(const vec256 *a); +void PQCLEAN_MCELIECE6960119_AVX_vec256_copy(vec256 *dest, const vec256 *src); + +/* bitsliced field multiplications */ +void PQCLEAN_MCELIECE6960119_AVX_vec256_mul(vec256 *h, vec256 *f, const vec256 *g); +void PQCLEAN_MCELIECE6960119_AVX_vec256_sq(vec256 * /*out*/, const vec256 * /*in*/); +void PQCLEAN_MCELIECE6960119_AVX_vec256_inv(vec256 * /*out*/, const vec256 * /*in*/); + +extern void PQCLEAN_MCELIECE6960119_AVX_vec256_maa_asm(vec256 *, vec256 *, const vec256 *); +extern void PQCLEAN_MCELIECE6960119_AVX_vec256_ama_asm(vec256 *, vec256 *, const vec256 *); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/vec256_ama_asm.S b/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/vec256_ama_asm.S new file mode 100644 index 0000000000..966a953950 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6960119_avx/vec256_ama_asm.S @@ -0,0 +1,2322 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: 
int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: reg256 a0 + +# qhasm: reg256 a1 + +# qhasm: reg256 a2 + +# qhasm: reg256 a3 + +# qhasm: reg256 a4 + +# qhasm: reg256 a5 + +# qhasm: reg256 a6 + +# qhasm: reg256 a7 + +# qhasm: reg256 a8 + +# qhasm: reg256 a9 + +# qhasm: reg256 a10 + +# qhasm: reg256 a11 + +# qhasm: reg256 a12 + +# qhasm: reg256 b0 + +# qhasm: reg256 b1 + +# qhasm: reg256 r0 + +# qhasm: reg256 r1 + +# qhasm: reg256 r2 + +# qhasm: reg256 r3 + +# qhasm: reg256 r4 + +# qhasm: reg256 r5 + +# qhasm: reg256 r6 + +# qhasm: reg256 r7 + +# qhasm: reg256 r8 + +# qhasm: reg256 r9 + +# qhasm: reg256 r10 + +# qhasm: reg256 r11 + +# qhasm: reg256 r12 + +# qhasm: reg256 r13 + +# qhasm: reg256 r14 + +# qhasm: reg256 r15 + +# qhasm: reg256 r16 + +# qhasm: reg256 r17 + +# qhasm: reg256 r18 + +# qhasm: reg256 r19 + +# qhasm: reg256 r20 + +# qhasm: reg256 r21 + +# qhasm: reg256 r22 + +# qhasm: reg256 r23 + +# qhasm: reg256 r24 + +# qhasm: reg256 r + +# qhasm: enter vec256_ama_asm +.p2align 5 +.global _PQCLEAN_MCELIECE6960119_AVX_vec256_ama_asm +.global PQCLEAN_MCELIECE6960119_AVX_vec256_ama_asm +_PQCLEAN_MCELIECE6960119_AVX_vec256_ama_asm: +PQCLEAN_MCELIECE6960119_AVX_vec256_ama_asm: +mov %rsp,%r11 +and $31,%r11 +add $0,%r11 +sub %r11,%rsp + +# qhasm: b0 = mem256[ input_2 + 0 ] +# asm 1: vmovupd 0(b0=reg256#1 +# asm 2: vmovupd 0(b0=%ymm0 +vmovupd 0(%rdx),%ymm0 + +# qhasm: a12 = mem256[ input_0 + 384 ] +# asm 1: vmovupd 384(a12=reg256#2 +# asm 2: vmovupd 384(a12=%ymm1 +vmovupd 384(%rdi),%ymm1 + +# qhasm: a12 = a12 ^ mem256[ input_1 + 384 ] +# asm 1: vpxor 384(a12=reg256#2 +# asm 2: vpxor 384(a12=%ymm1 +vpxor 384(%rsi),%ymm1,%ymm1 + +# qhasm: mem256[ input_0 + 384 ] = a12 +# asm 1: vmovupd r12=reg256#3 +# asm 2: vpand r12=%ymm2 +vpand %ymm1,%ymm0,%ymm2 + +# qhasm: r13 = a12 & mem256[input_2 + 32] +# asm 1: vpand 32(r13=reg256#4 +# asm 2: vpand 32(r13=%ymm3 +vpand 32(%rdx),%ymm1,%ymm3 + +# qhasm: r14 = a12 & mem256[input_2 + 64] +# asm 1: vpand 64(r14=reg256#5 +# asm 2: vpand 64(r14=%ymm4 +vpand 64(%rdx),%ymm1,%ymm4 + +# qhasm: r15 = a12 & mem256[input_2 + 96] +# asm 1: vpand 96(r15=reg256#6 +# asm 2: vpand 96(r15=%ymm5 +vpand 96(%rdx),%ymm1,%ymm5 + +# qhasm: r16 = a12 & mem256[input_2 + 128] +# asm 1: vpand 128(r16=reg256#7 +# asm 2: vpand 128(r16=%ymm6 +vpand 128(%rdx),%ymm1,%ymm6 + +# qhasm: r17 = a12 & mem256[input_2 + 160] +# asm 1: vpand 160(r17=reg256#8 +# asm 2: vpand 160(r17=%ymm7 +vpand 160(%rdx),%ymm1,%ymm7 + +# qhasm: r18 = a12 & mem256[input_2 + 192] +# asm 1: vpand 192(r18=reg256#9 +# asm 2: vpand 192(r18=%ymm8 +vpand 192(%rdx),%ymm1,%ymm8 + +# qhasm: r19 = a12 & mem256[input_2 + 224] +# asm 1: vpand 224(r19=reg256#10 +# asm 2: vpand 224(r19=%ymm9 +vpand 224(%rdx),%ymm1,%ymm9 + +# qhasm: r20 = a12 & mem256[input_2 + 256] +# asm 1: vpand 256(r20=reg256#11 +# asm 2: vpand 256(r20=%ymm10 +vpand 256(%rdx),%ymm1,%ymm10 + +# qhasm: r21 = a12 & mem256[input_2 + 288] +# asm 1: vpand 288(r21=reg256#12 +# asm 2: vpand 288(r21=%ymm11 +vpand 288(%rdx),%ymm1,%ymm11 + +# qhasm: r22 = a12 & mem256[input_2 + 320] +# asm 1: vpand 320(r22=reg256#13 +# asm 2: vpand 320(r22=%ymm12 +vpand 320(%rdx),%ymm1,%ymm12 + +# qhasm: r23 = a12 & mem256[input_2 + 352] +# asm 1: vpand 352(r23=reg256#14 +# asm 2: vpand 352(r23=%ymm13 +vpand 352(%rdx),%ymm1,%ymm13 + +# qhasm: r24 = a12 & mem256[input_2 + 384] +# asm 1: vpand 384(r24=reg256#2 +# asm 2: vpand 384(r24=%ymm1 +vpand 384(%rdx),%ymm1,%ymm1 + +# qhasm: r15 ^= r24 +# asm 1: vpxor r11=reg256#2 +# asm 2: vmovapd r11=%ymm1 
+vmovapd %ymm1,%ymm1 + +# qhasm: a11 = mem256[ input_0 + 352 ] +# asm 1: vmovupd 352(a11=reg256#15 +# asm 2: vmovupd 352(a11=%ymm14 +vmovupd 352(%rdi),%ymm14 + +# qhasm: a11 = a11 ^ mem256[ input_1 + 352 ] +# asm 1: vpxor 352(a11=reg256#15 +# asm 2: vpxor 352(a11=%ymm14 +vpxor 352(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 352 ] = a11 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r21 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r22 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r23 ^= r +# asm 1: vpxor r10=reg256#14 +# asm 2: vmovapd r10=%ymm13 +vmovapd %ymm13,%ymm13 + +# qhasm: a10 = mem256[ input_0 + 320 ] +# asm 1: vmovupd 320(a10=reg256#15 +# asm 2: vmovupd 320(a10=%ymm14 +vmovupd 320(%rdi),%ymm14 + +# qhasm: a10 = a10 ^ mem256[ input_1 + 320 ] +# asm 1: vpxor 320(a10=reg256#15 +# asm 2: vpxor 320(a10=%ymm14 +vpxor 320(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 320 ] = a10 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r21 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r22 
^= r +# asm 1: vpxor r9=reg256#13 +# asm 2: vmovapd r9=%ymm12 +vmovapd %ymm12,%ymm12 + +# qhasm: a9 = mem256[ input_0 + 288 ] +# asm 1: vmovupd 288(a9=reg256#15 +# asm 2: vmovupd 288(a9=%ymm14 +vmovupd 288(%rdi),%ymm14 + +# qhasm: a9 = a9 ^ mem256[ input_1 + 288 ] +# asm 1: vpxor 288(a9=reg256#15 +# asm 2: vpxor 288(a9=%ymm14 +vpxor 288(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 288 ] = a9 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r21 ^= r +# asm 1: vpxor r8=reg256#12 +# asm 2: vmovapd r8=%ymm11 +vmovapd %ymm11,%ymm11 + +# qhasm: a8 = mem256[ input_0 + 256 ] +# asm 1: vmovupd 256(a8=reg256#15 +# asm 2: vmovupd 256(a8=%ymm14 +vmovupd 256(%rdi),%ymm14 + +# qhasm: a8 = a8 ^ mem256[ input_1 + 256 ] +# asm 1: vpxor 256(a8=reg256#15 +# asm 2: vpxor 256(a8=%ymm14 +vpxor 256(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 256 ] = a8 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 
+vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r20 ^= r +# asm 1: vpxor r7=reg256#11 +# asm 2: vmovapd r7=%ymm10 +vmovapd %ymm10,%ymm10 + +# qhasm: a7 = mem256[ input_0 + 224 ] +# asm 1: vmovupd 224(a7=reg256#15 +# asm 2: vmovupd 224(a7=%ymm14 +vmovupd 224(%rdi),%ymm14 + +# qhasm: a7 = a7 ^ mem256[ input_1 + 224 ] +# asm 1: vpxor 224(a7=reg256#15 +# asm 2: vpxor 224(a7=%ymm14 +vpxor 224(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 224 ] = a7 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r19 ^= r +# asm 1: vpxor r6=reg256#10 +# asm 2: vmovapd r6=%ymm9 +vmovapd %ymm9,%ymm9 + +# qhasm: a6 = mem256[ input_0 + 192 ] +# asm 1: vmovupd 192(a6=reg256#15 +# asm 2: vmovupd 192(a6=%ymm14 +vmovupd 192(%rdi),%ymm14 + +# qhasm: a6 = a6 ^ mem256[ input_1 + 192 ] +# asm 1: vpxor 192(a6=reg256#15 +# asm 2: vpxor 192(a6=%ymm14 +vpxor 192(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 192 ] = a6 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor 
r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r18 ^= r +# asm 1: vpxor r5=reg256#9 +# asm 2: vmovapd r5=%ymm8 +vmovapd %ymm8,%ymm8 + +# qhasm: a5 = mem256[ input_0 + 160 ] +# asm 1: vmovupd 160(a5=reg256#15 +# asm 2: vmovupd 160(a5=%ymm14 +vmovupd 160(%rdi),%ymm14 + +# qhasm: a5 = a5 ^ mem256[ input_1 + 160 ] +# asm 1: vpxor 160(a5=reg256#15 +# asm 2: vpxor 160(a5=%ymm14 +vpxor 160(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 160 ] = a5 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r17 ^= r +# asm 1: vpxor r4=reg256#8 +# asm 2: vmovapd r4=%ymm7 +vmovapd %ymm7,%ymm7 + +# qhasm: a4 = mem256[ input_0 + 128 ] +# asm 1: vmovupd 128(a4=reg256#15 +# asm 2: vmovupd 128(a4=%ymm14 +vmovupd 128(%rdi),%ymm14 + +# qhasm: a4 = a4 ^ mem256[ input_1 + 128 ] +# asm 1: vpxor 128(a4=reg256#15 +# asm 2: vpxor 128(a4=%ymm14 +vpxor 128(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 128 ] = a4 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# 
qhasm: r15 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r16 ^= r +# asm 1: vpxor r3=reg256#7 +# asm 2: vmovapd r3=%ymm6 +vmovapd %ymm6,%ymm6 + +# qhasm: a3 = mem256[ input_0 + 96 ] +# asm 1: vmovupd 96(a3=reg256#15 +# asm 2: vmovupd 96(a3=%ymm14 +vmovupd 96(%rdi),%ymm14 + +# qhasm: a3 = a3 ^ mem256[ input_1 + 96 ] +# asm 1: vpxor 96(a3=reg256#15 +# asm 2: vpxor 96(a3=%ymm14 +vpxor 96(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 96 ] = a3 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r15 ^= r +# asm 1: vpxor r2=reg256#6 +# asm 2: vmovapd r2=%ymm5 +vmovapd %ymm5,%ymm5 + +# qhasm: a2 = mem256[ input_0 + 64 ] +# asm 1: vmovupd 64(a2=reg256#15 +# asm 2: vmovupd 64(a2=%ymm14 +vmovupd 64(%rdi),%ymm14 + +# qhasm: a2 = a2 ^ mem256[ input_1 + 64 ] +# asm 1: vpxor 64(a2=reg256#15 +# asm 2: vpxor 64(a2=%ymm14 +vpxor 64(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 64 ] = a2 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 
+ +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r14 ^= r +# asm 1: vpxor r1=reg256#5 +# asm 2: vmovapd r1=%ymm4 +vmovapd %ymm4,%ymm4 + +# qhasm: a1 = mem256[ input_0 + 32 ] +# asm 1: vmovupd 32(a1=reg256#15 +# asm 2: vmovupd 32(a1=%ymm14 +vmovupd 32(%rdi),%ymm14 + +# qhasm: a1 = a1 ^ mem256[ input_1 + 32 ] +# asm 1: vpxor 32(a1=reg256#15 +# asm 2: vpxor 32(a1=%ymm14 +vpxor 32(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 32 ] = a1 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r1 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r13 ^= r +# asm 1: vpxor r0=reg256#4 +# asm 2: vmovapd r0=%ymm3 +vmovapd %ymm3,%ymm3 + +# qhasm: a0 = mem256[ input_0 + 0 ] +# asm 1: vmovupd 0(a0=reg256#15 +# asm 2: vmovupd 0(a0=%ymm14 +vmovupd 0(%rdi),%ymm14 + +# qhasm: a0 = a0 ^ mem256[ input_1 + 0 ] +# asm 1: vpxor 0(a0=reg256#15 +# asm 2: vpxor 0(a0=%ymm14 +vpxor 0(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 0 ] = a0 +# asm 1: vmovupd r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm14,%ymm0,%ymm0 + +# qhasm: r0 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 32(r=%ymm0 +vpand 32(%rdx),%ymm14,%ymm0 + +# qhasm: r1 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 64(r=%ymm0 +vpand 64(%rdx),%ymm14,%ymm0 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 96(r=%ymm0 +vpand 96(%rdx),%ymm14,%ymm0 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 128(r=%ymm0 +vpand 128(%rdx),%ymm14,%ymm0 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 160(r=%ymm0 +vpand 160(%rdx),%ymm14,%ymm0 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 192(r=%ymm0 +vpand 192(%rdx),%ymm14,%ymm0 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 224(r=%ymm0 +vpand 224(%rdx),%ymm14,%ymm0 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 256(r=%ymm0 +vpand 256(%rdx),%ymm14,%ymm0 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 288(r=%ymm0 +vpand 288(%rdx),%ymm14,%ymm0 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 320(r=%ymm0 +vpand 320(%rdx),%ymm14,%ymm0 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 352(r=%ymm0 +vpand 352(%rdx),%ymm14,%ymm0 + +# qhasm: r11 ^= r +# asm 1: vpxor 
r=reg256#1 +# asm 2: vpand 384(r=%ymm0 +vpand 384(%rdx),%ymm14,%ymm0 + +# qhasm: r12 ^= r +# asm 1: vpxor r12=reg256#1 +# asm 2: vpxor 384(r12=%ymm0 +vpxor 384(%rsi),%ymm2,%ymm0 + +# qhasm: mem256[ input_1 + 384 ] = r12 +# asm 1: vmovupd r11=reg256#1 +# asm 2: vpxor 352(r11=%ymm0 +vpxor 352(%rsi),%ymm1,%ymm0 + +# qhasm: mem256[ input_1 + 352 ] = r11 +# asm 1: vmovupd r10=reg256#1 +# asm 2: vpxor 320(r10=%ymm0 +vpxor 320(%rsi),%ymm13,%ymm0 + +# qhasm: mem256[ input_1 + 320 ] = r10 +# asm 1: vmovupd r9=reg256#1 +# asm 2: vpxor 288(r9=%ymm0 +vpxor 288(%rsi),%ymm12,%ymm0 + +# qhasm: mem256[ input_1 + 288 ] = r9 +# asm 1: vmovupd r8=reg256#1 +# asm 2: vpxor 256(r8=%ymm0 +vpxor 256(%rsi),%ymm11,%ymm0 + +# qhasm: mem256[ input_1 + 256 ] = r8 +# asm 1: vmovupd r7=reg256#1 +# asm 2: vpxor 224(r7=%ymm0 +vpxor 224(%rsi),%ymm10,%ymm0 + +# qhasm: mem256[ input_1 + 224 ] = r7 +# asm 1: vmovupd r6=reg256#1 +# asm 2: vpxor 192(r6=%ymm0 +vpxor 192(%rsi),%ymm9,%ymm0 + +# qhasm: mem256[ input_1 + 192 ] = r6 +# asm 1: vmovupd r5=reg256#1 +# asm 2: vpxor 160(r5=%ymm0 +vpxor 160(%rsi),%ymm8,%ymm0 + +# qhasm: mem256[ input_1 + 160 ] = r5 +# asm 1: vmovupd r4=reg256#1 +# asm 2: vpxor 128(r4=%ymm0 +vpxor 128(%rsi),%ymm7,%ymm0 + +# qhasm: mem256[ input_1 + 128 ] = r4 +# asm 1: vmovupd r3=reg256#1 +# asm 2: vpxor 96(r3=%ymm0 +vpxor 96(%rsi),%ymm6,%ymm0 + +# qhasm: mem256[ input_1 + 96 ] = r3 +# asm 1: vmovupd r2=reg256#1 +# asm 2: vpxor 64(r2=%ymm0 +vpxor 64(%rsi),%ymm5,%ymm0 + +# qhasm: mem256[ input_1 + 64 ] = r2 +# asm 1: vmovupd r1=reg256#1 +# asm 2: vpxor 32(r1=%ymm0 +vpxor 32(%rsi),%ymm4,%ymm0 + +# qhasm: mem256[ input_1 + 32 ] = r1 +# asm 1: vmovupd r0=reg256#1 +# asm 2: vpxor 0(r0=%ymm0 +vpxor 0(%rsi),%ymm3,%ymm0 + +# qhasm: mem256[ input_1 + 0 ] = r0 +# asm 1: vmovupd b0=reg256#1 +# asm 2: vmovupd 0(b0=%ymm0 +vmovupd 0(%rdx),%ymm0 + +# qhasm: a12 = mem256[ input_1 + 384 ] +# asm 1: vmovupd 384(a12=reg256#2 +# asm 2: vmovupd 384(a12=%ymm1 +vmovupd 384(%rsi),%ymm1 + +# qhasm: r12 = a12 & b0 +# asm 1: vpand r12=reg256#3 +# asm 2: vpand r12=%ymm2 +vpand %ymm1,%ymm0,%ymm2 + +# qhasm: r13 = a12 & mem256[input_2 + 32] +# asm 1: vpand 32(r13=reg256#4 +# asm 2: vpand 32(r13=%ymm3 +vpand 32(%rdx),%ymm1,%ymm3 + +# qhasm: r14 = a12 & mem256[input_2 + 64] +# asm 1: vpand 64(r14=reg256#5 +# asm 2: vpand 64(r14=%ymm4 +vpand 64(%rdx),%ymm1,%ymm4 + +# qhasm: r15 = a12 & mem256[input_2 + 96] +# asm 1: vpand 96(r15=reg256#6 +# asm 2: vpand 96(r15=%ymm5 +vpand 96(%rdx),%ymm1,%ymm5 + +# qhasm: r16 = a12 & mem256[input_2 + 128] +# asm 1: vpand 128(r16=reg256#7 +# asm 2: vpand 128(r16=%ymm6 +vpand 128(%rdx),%ymm1,%ymm6 + +# qhasm: r17 = a12 & mem256[input_2 + 160] +# asm 1: vpand 160(r17=reg256#8 +# asm 2: vpand 160(r17=%ymm7 +vpand 160(%rdx),%ymm1,%ymm7 + +# qhasm: r18 = a12 & mem256[input_2 + 192] +# asm 1: vpand 192(r18=reg256#9 +# asm 2: vpand 192(r18=%ymm8 +vpand 192(%rdx),%ymm1,%ymm8 + +# qhasm: r19 = a12 & mem256[input_2 + 224] +# asm 1: vpand 224(r19=reg256#10 +# asm 2: vpand 224(r19=%ymm9 +vpand 224(%rdx),%ymm1,%ymm9 + +# qhasm: r20 = a12 & mem256[input_2 + 256] +# asm 1: vpand 256(r20=reg256#11 +# asm 2: vpand 256(r20=%ymm10 +vpand 256(%rdx),%ymm1,%ymm10 + +# qhasm: r21 = a12 & mem256[input_2 + 288] +# asm 1: vpand 288(r21=reg256#12 +# asm 2: vpand 288(r21=%ymm11 +vpand 288(%rdx),%ymm1,%ymm11 + +# qhasm: r22 = a12 & mem256[input_2 + 320] +# asm 1: vpand 320(r22=reg256#13 +# asm 2: vpand 320(r22=%ymm12 +vpand 320(%rdx),%ymm1,%ymm12 + +# qhasm: r23 = a12 & mem256[input_2 + 352] +# asm 1: vpand 352(r23=reg256#14 +# asm 2: 
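# annotation (not part of the generated upstream file): the block above and the
# one that follows repeat a single pattern. Each 256-bit limb a_i of one operand
# (13 limbs at offsets 0, 32, ..., 384) is AND-ed with the 13 limbs of the
# operand held at input_2/%rdx, the partial products are XOR-accumulated into
# r_i..r_(i+12), and the high terms r13..r24 are then XOR-folded back into the
# low limbs (e.g. "r15 ^= r24"). This looks like the bitsliced carry-less
# multiply-and-reduce used by the Classic McEliece AVX2 vector field
# arithmetic; the first pass additionally XORs input_0 and input_1 together
# before multiplying, while the pass starting here multiplies input_1 directly.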
vpand 352(r23=%ymm13 +vpand 352(%rdx),%ymm1,%ymm13 + +# qhasm: r24 = a12 & mem256[input_2 + 384] +# asm 1: vpand 384(r24=reg256#2 +# asm 2: vpand 384(r24=%ymm1 +vpand 384(%rdx),%ymm1,%ymm1 + +# qhasm: r15 ^= r24 +# asm 1: vpxor r11=reg256#2 +# asm 2: vmovapd r11=%ymm1 +vmovapd %ymm1,%ymm1 + +# qhasm: a11 = mem256[ input_1 + 352 ] +# asm 1: vmovupd 352(a11=reg256#15 +# asm 2: vmovupd 352(a11=%ymm14 +vmovupd 352(%rsi),%ymm14 + +# qhasm: r = a11 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r21 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r22 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r23 ^= r +# asm 1: vpxor r10=reg256#14 +# asm 2: vmovapd r10=%ymm13 +vmovapd %ymm13,%ymm13 + +# qhasm: a10 = mem256[ input_1 + 320 ] +# asm 1: vmovupd 320(a10=reg256#15 +# asm 2: vmovupd 320(a10=%ymm14 +vmovupd 320(%rsi),%ymm14 + +# qhasm: r = a10 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r21 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r22 ^= r +# asm 1: vpxor r9=reg256#13 +# asm 2: vmovapd 
r9=%ymm12 +vmovapd %ymm12,%ymm12 + +# qhasm: a9 = mem256[ input_1 + 288 ] +# asm 1: vmovupd 288(a9=reg256#15 +# asm 2: vmovupd 288(a9=%ymm14 +vmovupd 288(%rsi),%ymm14 + +# qhasm: r = a9 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r21 ^= r +# asm 1: vpxor r8=reg256#12 +# asm 2: vmovapd r8=%ymm11 +vmovapd %ymm11,%ymm11 + +# qhasm: a8 = mem256[ input_1 + 256 ] +# asm 1: vmovupd 256(a8=reg256#15 +# asm 2: vmovupd 256(a8=%ymm14 +vmovupd 256(%rsi),%ymm14 + +# qhasm: r = a8 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r20 ^= r +# asm 1: vpxor r7=reg256#11 +# asm 2: vmovapd r7=%ymm10 +vmovapd %ymm10,%ymm10 + +# qhasm: a7 = mem256[ input_1 + 224 ] +# asm 1: vmovupd 224(a7=reg256#15 +# asm 2: vmovupd 224(a7=%ymm14 +vmovupd 224(%rsi),%ymm14 + +# qhasm: r = a7 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# 
qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r19 ^= r +# asm 1: vpxor r6=reg256#10 +# asm 2: vmovapd r6=%ymm9 +vmovapd %ymm9,%ymm9 + +# qhasm: a6 = mem256[ input_1 + 192 ] +# asm 1: vmovupd 192(a6=reg256#15 +# asm 2: vmovupd 192(a6=%ymm14 +vmovupd 192(%rsi),%ymm14 + +# qhasm: r = a6 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r18 ^= r +# asm 1: vpxor r5=reg256#9 +# asm 2: vmovapd r5=%ymm8 +vmovapd %ymm8,%ymm8 + +# qhasm: a5 = mem256[ input_1 + 160 ] +# asm 1: vmovupd 160(a5=reg256#15 +# asm 2: vmovupd 160(a5=%ymm14 +vmovupd 160(%rsi),%ymm14 + +# qhasm: r = a5 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 
+vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r17 ^= r +# asm 1: vpxor r4=reg256#8 +# asm 2: vmovapd r4=%ymm7 +vmovapd %ymm7,%ymm7 + +# qhasm: a4 = mem256[ input_1 + 128 ] +# asm 1: vmovupd 128(a4=reg256#15 +# asm 2: vmovupd 128(a4=%ymm14 +vmovupd 128(%rsi),%ymm14 + +# qhasm: r = a4 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r16 ^= r +# asm 1: vpxor r3=reg256#7 +# asm 2: vmovapd r3=%ymm6 +vmovapd %ymm6,%ymm6 + +# qhasm: a3 = mem256[ input_1 + 96 ] +# asm 1: vmovupd 96(a3=reg256#15 +# asm 2: vmovupd 96(a3=%ymm14 +vmovupd 96(%rsi),%ymm14 + +# qhasm: r = a3 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 
+# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r15 ^= r +# asm 1: vpxor r2=reg256#6 +# asm 2: vmovapd r2=%ymm5 +vmovapd %ymm5,%ymm5 + +# qhasm: a2 = mem256[ input_1 + 64 ] +# asm 1: vmovupd 64(a2=reg256#15 +# asm 2: vmovupd 64(a2=%ymm14 +vmovupd 64(%rsi),%ymm14 + +# qhasm: r = a2 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r14 ^= r +# asm 1: vpxor r1=reg256#5 +# asm 2: vmovapd r1=%ymm4 +vmovapd %ymm4,%ymm4 + +# qhasm: a1 = mem256[ input_1 + 32 ] +# asm 1: vmovupd 32(a1=reg256#15 +# asm 2: vmovupd 32(a1=%ymm14 +vmovupd 32(%rsi),%ymm14 + +# qhasm: r = a1 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r1 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 
1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r13 ^= r +# asm 1: vpxor r0=reg256#4 +# asm 2: vmovapd r0=%ymm3 +vmovapd %ymm3,%ymm3 + +# qhasm: a0 = mem256[ input_1 + 0 ] +# asm 1: vmovupd 0(a0=reg256#15 +# asm 2: vmovupd 0(a0=%ymm14 +vmovupd 0(%rsi),%ymm14 + +# qhasm: r = a0 & b0 +# asm 1: vpand r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm14,%ymm0,%ymm0 + +# qhasm: r0 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 32(r=%ymm0 +vpand 32(%rdx),%ymm14,%ymm0 + +# qhasm: r1 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 64(r=%ymm0 +vpand 64(%rdx),%ymm14,%ymm0 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 96(r=%ymm0 +vpand 96(%rdx),%ymm14,%ymm0 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 128(r=%ymm0 +vpand 128(%rdx),%ymm14,%ymm0 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 160(r=%ymm0 +vpand 160(%rdx),%ymm14,%ymm0 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 192(r=%ymm0 +vpand 192(%rdx),%ymm14,%ymm0 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 224(r=%ymm0 +vpand 224(%rdx),%ymm14,%ymm0 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 256(r=%ymm0 +vpand 256(%rdx),%ymm14,%ymm0 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 288(r=%ymm0 +vpand 288(%rdx),%ymm14,%ymm0 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 320(r=%ymm0 +vpand 320(%rdx),%ymm14,%ymm0 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 352(r=%ymm0 +vpand 352(%rdx),%ymm14,%ymm0 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 384(r=%ymm0 +vpand 384(%rdx),%ymm14,%ymm0 + +# qhasm: r12 ^= r +# asm 1: vpxor r12=reg256#1 +# asm 2: vpxor 384(r12=%ymm0 +vpxor 384(%rdi),%ymm2,%ymm0 + +# qhasm: mem256[ input_0 + 384 ] = r12 +# asm 1: vmovupd r12=reg256#1 +# asm 2: vpxor 384(r12=%ymm0 +vpxor 384(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 384 ] = r12 +# asm 1: vmovupd r11=reg256#1 +# asm 2: vpxor 352(r11=%ymm0 +vpxor 352(%rdi),%ymm1,%ymm0 + +# qhasm: mem256[ input_0 + 352 ] = r11 +# asm 1: vmovupd r11=reg256#1 +# asm 2: vpxor 352(r11=%ymm0 +vpxor 352(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 352 ] = r11 +# asm 1: vmovupd r10=reg256#1 +# asm 2: vpxor 320(r10=%ymm0 +vpxor 320(%rdi),%ymm13,%ymm0 + +# qhasm: mem256[ input_0 + 320 ] = r10 +# asm 1: vmovupd r10=reg256#1 +# asm 2: vpxor 320(r10=%ymm0 +vpxor 320(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 320 ] = r10 +# asm 1: vmovupd r9=reg256#1 +# asm 2: vpxor 288(r9=%ymm0 +vpxor 288(%rdi),%ymm12,%ymm0 + +# qhasm: mem256[ input_0 + 288 ] = r9 +# asm 1: vmovupd r9=reg256#1 +# asm 2: vpxor 288(r9=%ymm0 +vpxor 288(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 288 ] = r9 +# asm 1: vmovupd r8=reg256#1 +# asm 2: vpxor 256(r8=%ymm0 +vpxor 256(%rdi),%ymm11,%ymm0 + +# qhasm: mem256[ input_0 + 256 ] = r8 +# asm 1: vmovupd r8=reg256#1 +# asm 2: vpxor 256(r8=%ymm0 +vpxor 256(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 256 ] = r8 +# asm 1: vmovupd r7=reg256#1 +# asm 2: vpxor 224(r7=%ymm0 +vpxor 224(%rdi),%ymm10,%ymm0 + +# qhasm: mem256[ input_0 + 224 ] = r7 +# asm 1: vmovupd r7=reg256#1 +# asm 2: vpxor 224(r7=%ymm0 +vpxor 224(%rsi),%ymm0,%ymm0 + +# qhasm: 
mem256[ input_1 + 224 ] = r7 +# asm 1: vmovupd r6=reg256#1 +# asm 2: vpxor 192(r6=%ymm0 +vpxor 192(%rdi),%ymm9,%ymm0 + +# qhasm: mem256[ input_0 + 192 ] = r6 +# asm 1: vmovupd r6=reg256#1 +# asm 2: vpxor 192(r6=%ymm0 +vpxor 192(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 192 ] = r6 +# asm 1: vmovupd r5=reg256#1 +# asm 2: vpxor 160(r5=%ymm0 +vpxor 160(%rdi),%ymm8,%ymm0 + +# qhasm: mem256[ input_0 + 160 ] = r5 +# asm 1: vmovupd r5=reg256#1 +# asm 2: vpxor 160(r5=%ymm0 +vpxor 160(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 160 ] = r5 +# asm 1: vmovupd r4=reg256#1 +# asm 2: vpxor 128(r4=%ymm0 +vpxor 128(%rdi),%ymm7,%ymm0 + +# qhasm: mem256[ input_0 + 128 ] = r4 +# asm 1: vmovupd r4=reg256#1 +# asm 2: vpxor 128(r4=%ymm0 +vpxor 128(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 128 ] = r4 +# asm 1: vmovupd r3=reg256#1 +# asm 2: vpxor 96(r3=%ymm0 +vpxor 96(%rdi),%ymm6,%ymm0 + +# qhasm: mem256[ input_0 + 96 ] = r3 +# asm 1: vmovupd r3=reg256#1 +# asm 2: vpxor 96(r3=%ymm0 +vpxor 96(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 96 ] = r3 +# asm 1: vmovupd r2=reg256#1 +# asm 2: vpxor 64(r2=%ymm0 +vpxor 64(%rdi),%ymm5,%ymm0 + +# qhasm: mem256[ input_0 + 64 ] = r2 +# asm 1: vmovupd r2=reg256#1 +# asm 2: vpxor 64(r2=%ymm0 +vpxor 64(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 64 ] = r2 +# asm 1: vmovupd r1=reg256#1 +# asm 2: vpxor 32(r1=%ymm0 +vpxor 32(%rdi),%ymm4,%ymm0 + +# qhasm: mem256[ input_0 + 32 ] = r1 +# asm 1: vmovupd r1=reg256#1 +# asm 2: vpxor 32(r1=%ymm0 +vpxor 32(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 32 ] = r1 +# asm 1: vmovupd r0=reg256#1 +# asm 2: vpxor 0(r0=%ymm0 +vpxor 0(%rdi),%ymm3,%ymm0 + +# qhasm: mem256[ input_0 + 0 ] = r0 +# asm 1: vmovupd r0=reg256#1 +# asm 2: vpxor 0(r0=%ymm0 +vpxor 0(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 0 ] = r0 +# asm 1: vmovupd b0=reg256#1 +# asm 2: vmovupd 0(b0=%ymm0 +vmovupd 0(%rdx),%ymm0 + +# qhasm: a12 = mem256[ input_1 + 384 ] +# asm 1: vmovupd 384(a12=reg256#2 +# asm 2: vmovupd 384(a12=%ymm1 +vmovupd 384(%rsi),%ymm1 + +# qhasm: r12 = a12 & b0 +# asm 1: vpand r12=reg256#3 +# asm 2: vpand r12=%ymm2 +vpand %ymm1,%ymm0,%ymm2 + +# qhasm: r13 = a12 & mem256[input_2 + 32] +# asm 1: vpand 32(r13=reg256#4 +# asm 2: vpand 32(r13=%ymm3 +vpand 32(%rdx),%ymm1,%ymm3 + +# qhasm: r14 = a12 & mem256[input_2 + 64] +# asm 1: vpand 64(r14=reg256#5 +# asm 2: vpand 64(r14=%ymm4 +vpand 64(%rdx),%ymm1,%ymm4 + +# qhasm: r15 = a12 & mem256[input_2 + 96] +# asm 1: vpand 96(r15=reg256#6 +# asm 2: vpand 96(r15=%ymm5 +vpand 96(%rdx),%ymm1,%ymm5 + +# qhasm: r16 = a12 & mem256[input_2 + 128] +# asm 1: vpand 128(r16=reg256#7 +# asm 2: vpand 128(r16=%ymm6 +vpand 128(%rdx),%ymm1,%ymm6 + +# qhasm: r17 = a12 & mem256[input_2 + 160] +# asm 1: vpand 160(r17=reg256#8 +# asm 2: vpand 160(r17=%ymm7 +vpand 160(%rdx),%ymm1,%ymm7 + +# qhasm: r18 = a12 & mem256[input_2 + 192] +# asm 1: vpand 192(r18=reg256#9 +# asm 2: vpand 192(r18=%ymm8 +vpand 192(%rdx),%ymm1,%ymm8 + +# qhasm: r19 = a12 & mem256[input_2 + 224] +# asm 1: vpand 224(r19=reg256#10 +# asm 2: vpand 224(r19=%ymm9 +vpand 224(%rdx),%ymm1,%ymm9 + +# qhasm: r20 = a12 & mem256[input_2 + 256] +# asm 1: vpand 256(r20=reg256#11 +# asm 2: vpand 256(r20=%ymm10 +vpand 256(%rdx),%ymm1,%ymm10 + +# qhasm: r21 = a12 & mem256[input_2 + 288] +# asm 1: vpand 288(r21=reg256#12 +# asm 2: vpand 288(r21=%ymm11 +vpand 288(%rdx),%ymm1,%ymm11 + +# qhasm: r22 = a12 & mem256[input_2 + 320] +# asm 1: vpand 320(r22=reg256#13 +# asm 2: vpand 320(r22=%ymm12 +vpand 320(%rdx),%ymm1,%ymm12 + +# qhasm: r23 = a12 & mem256[input_2 + 352] +# 
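# annotation (not part of the generated upstream file): each pass ends by
# XOR-ing the accumulated limbs r12..r0 into the destination buffer(s) and
# storing them back with vmovupd. In the pass just above the result is folded
# into both input_0/%rdi and input_1/%rsi; b0 is then reloaded from
# input_2/%rdx and the next multiply-accumulate pass starts again from a12.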
asm 1: vpand 352(r23=reg256#14 +# asm 2: vpand 352(r23=%ymm13 +vpand 352(%rdx),%ymm1,%ymm13 + +# qhasm: r24 = a12 & mem256[input_2 + 384] +# asm 1: vpand 384(r24=reg256#2 +# asm 2: vpand 384(r24=%ymm1 +vpand 384(%rdx),%ymm1,%ymm1 + +# qhasm: r15 ^= r24 +# asm 1: vpxor r11=reg256#2 +# asm 2: vmovapd r11=%ymm1 +vmovapd %ymm1,%ymm1 + +# qhasm: a11 = mem256[ input_1 + 352 ] +# asm 1: vmovupd 352(a11=reg256#15 +# asm 2: vmovupd 352(a11=%ymm14 +vmovupd 352(%rsi),%ymm14 + +# qhasm: r = a11 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r21 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r22 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r23 ^= r +# asm 1: vpxor r10=reg256#14 +# asm 2: vmovapd r10=%ymm13 +vmovapd %ymm13,%ymm13 + +# qhasm: a10 = mem256[ input_1 + 320 ] +# asm 1: vmovupd 320(a10=reg256#15 +# asm 2: vmovupd 320(a10=%ymm14 +vmovupd 320(%rsi),%ymm14 + +# qhasm: r = a10 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r21 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r22 ^= r +# asm 1: 
vpxor r9=reg256#13 +# asm 2: vmovapd r9=%ymm12 +vmovapd %ymm12,%ymm12 + +# qhasm: a9 = mem256[ input_1 + 288 ] +# asm 1: vmovupd 288(a9=reg256#15 +# asm 2: vmovupd 288(a9=%ymm14 +vmovupd 288(%rsi),%ymm14 + +# qhasm: r = a9 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r21 ^= r +# asm 1: vpxor r8=reg256#12 +# asm 2: vmovapd r8=%ymm11 +vmovapd %ymm11,%ymm11 + +# qhasm: a8 = mem256[ input_1 + 256 ] +# asm 1: vmovupd 256(a8=reg256#15 +# asm 2: vmovupd 256(a8=%ymm14 +vmovupd 256(%rsi),%ymm14 + +# qhasm: r = a8 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r20 ^= r +# asm 1: vpxor r7=reg256#11 +# asm 2: vmovapd r7=%ymm10 +vmovapd %ymm10,%ymm10 + +# qhasm: a7 = mem256[ input_1 + 224 ] +# asm 1: vmovupd 224(a7=reg256#15 +# asm 2: vmovupd 224(a7=%ymm14 +vmovupd 224(%rsi),%ymm14 + +# qhasm: r = a7 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand 
r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r19 ^= r +# asm 1: vpxor r6=reg256#10 +# asm 2: vmovapd r6=%ymm9 +vmovapd %ymm9,%ymm9 + +# qhasm: a6 = mem256[ input_1 + 192 ] +# asm 1: vmovupd 192(a6=reg256#15 +# asm 2: vmovupd 192(a6=%ymm14 +vmovupd 192(%rsi),%ymm14 + +# qhasm: r = a6 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r18 ^= r +# asm 1: vpxor r5=reg256#9 +# asm 2: vmovapd r5=%ymm8 +vmovapd %ymm8,%ymm8 + +# qhasm: a5 = mem256[ input_1 + 160 ] +# asm 1: vmovupd 160(a5=reg256#15 +# asm 2: vmovupd 160(a5=%ymm14 +vmovupd 160(%rsi),%ymm14 + +# qhasm: r = a5 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor 
r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r17 ^= r +# asm 1: vpxor r4=reg256#8 +# asm 2: vmovapd r4=%ymm7 +vmovapd %ymm7,%ymm7 + +# qhasm: a4 = mem256[ input_1 + 128 ] +# asm 1: vmovupd 128(a4=reg256#15 +# asm 2: vmovupd 128(a4=%ymm14 +vmovupd 128(%rsi),%ymm14 + +# qhasm: r = a4 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r16 ^= r +# asm 1: vpxor r3=reg256#7 +# asm 2: vmovapd r3=%ymm6 +vmovapd %ymm6,%ymm6 + +# qhasm: a3 = mem256[ input_1 + 96 ] +# asm 1: vmovupd 96(a3=reg256#15 +# asm 2: vmovupd 96(a3=%ymm14 +vmovupd 96(%rsi),%ymm14 + +# qhasm: r = a3 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# 
qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r15 ^= r +# asm 1: vpxor r2=reg256#6 +# asm 2: vmovapd r2=%ymm5 +vmovapd %ymm5,%ymm5 + +# qhasm: a2 = mem256[ input_1 + 64 ] +# asm 1: vmovupd 64(a2=reg256#15 +# asm 2: vmovupd 64(a2=%ymm14 +vmovupd 64(%rsi),%ymm14 + +# qhasm: r = a2 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r14 ^= r +# asm 1: vpxor r1=reg256#5 +# asm 2: vmovapd r1=%ymm4 +vmovapd %ymm4,%ymm4 + +# qhasm: a1 = mem256[ input_1 + 32 ] +# asm 1: vmovupd 32(a1=reg256#15 +# asm 2: vmovupd 32(a1=%ymm14 +vmovupd 32(%rsi),%ymm14 + +# qhasm: r = a1 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r1 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 
256(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r13 ^= r +# asm 1: vpxor r0=reg256#4 +# asm 2: vmovapd r0=%ymm3 +vmovapd %ymm3,%ymm3 + +# qhasm: a0 = mem256[ input_1 + 0 ] +# asm 1: vmovupd 0(a0=reg256#15 +# asm 2: vmovupd 0(a0=%ymm14 +vmovupd 0(%rsi),%ymm14 + +# qhasm: r = a0 & b0 +# asm 1: vpand r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm14,%ymm0,%ymm0 + +# qhasm: r0 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 32(r=%ymm0 +vpand 32(%rdx),%ymm14,%ymm0 + +# qhasm: r1 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 64(r=%ymm0 +vpand 64(%rdx),%ymm14,%ymm0 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 96(r=%ymm0 +vpand 96(%rdx),%ymm14,%ymm0 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 128(r=%ymm0 +vpand 128(%rdx),%ymm14,%ymm0 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 160(r=%ymm0 +vpand 160(%rdx),%ymm14,%ymm0 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 192(r=%ymm0 +vpand 192(%rdx),%ymm14,%ymm0 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 224(r=%ymm0 +vpand 224(%rdx),%ymm14,%ymm0 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 256(r=%ymm0 +vpand 256(%rdx),%ymm14,%ymm0 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 288(r=%ymm0 +vpand 288(%rdx),%ymm14,%ymm0 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 320(r=%ymm0 +vpand 320(%rdx),%ymm14,%ymm0 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 352(r=%ymm0 +vpand 352(%rdx),%ymm14,%ymm0 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 384(r=%ymm0 +vpand 384(%rdx),%ymm14,%ymm0 + +# qhasm: r12 ^= r +# asm 1: vpxor r=int64#7 +# asm 2: mov $0,>r=%rax +mov $0,%rax + +# qhasm: t0 = mem64[ input_0 + 192 ] +# asm 1: movq 192(t0=int64#2 +# asm 2: movq 192(t0=%rsi +movq 192(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 200 ] +# asm 1: movq 200(t1=int64#3 +# asm 2: movq 200(t1=%rdx +movq 200(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 176(t0=%rsi +movq 176(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 184 ] +# asm 1: movq 184(t1=int64#3 +# asm 2: movq 184(t1=%rdx +movq 184(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 160(t0=%rsi +movq 160(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 168 ] +# asm 1: movq 168(t1=int64#3 +# asm 2: movq 168(t1=%rdx +movq 168(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 144(t0=%rsi +movq 144(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 152 ] +# asm 1: movq 152(t1=int64#3 +# asm 2: movq 152(t1=%rdx +movq 152(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 128(t0=%rsi +movq 128(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 136 ] +# asm 1: movq 136(t1=int64#3 +# 
asm 2: movq 136(t1=%rdx +movq 136(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 112(t0=%rsi +movq 112(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 120 ] +# asm 1: movq 120(t1=int64#3 +# asm 2: movq 120(t1=%rdx +movq 120(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 96(t0=%rsi +movq 96(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 104 ] +# asm 1: movq 104(t1=int64#3 +# asm 2: movq 104(t1=%rdx +movq 104(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 80(t0=%rsi +movq 80(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 88 ] +# asm 1: movq 88(t1=int64#3 +# asm 2: movq 88(t1=%rdx +movq 88(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 64(t0=%rsi +movq 64(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 72 ] +# asm 1: movq 72(t1=int64#3 +# asm 2: movq 72(t1=%rdx +movq 72(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 48(t0=%rsi +movq 48(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 56 ] +# asm 1: movq 56(t1=int64#3 +# asm 2: movq 56(t1=%rdx +movq 56(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 32(t0=%rsi +movq 32(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 40 ] +# asm 1: movq 40(t1=int64#3 +# asm 2: movq 40(t1=%rdx +movq 40(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 16(t0=%rsi +movq 16(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 24 ] +# asm 1: movq 24(t1=int64#3 +# asm 2: movq 24(t1=%rdx +movq 24(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 0(t0=%rsi +movq 0(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(t1=int64#1 +# asm 2: movq 8(t1=%rdi +movq 8(%rdi),%rdi + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#1 +# asm 2: popcnt c=%rdi +popcnt %rsi, %rdi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1, +#include + +#include "aes.h" + + +void PQCLEAN_MCELIECE6960119F_AVX_aes256ctr( + uint8_t *out, + size_t outlen, + const uint8_t nonce[AESCTR_NONCEBYTES], + const uint8_t key[AES256_KEYBYTES] +); + +#endif diff --git a/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/api.h b/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/api.h new file mode 100644 index 0000000000..3ea68e54b4 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/api.h @@ -0,0 +1,31 @@ +#ifndef PQCLEAN_MCELIECE6960119F_AVX_API_H +#define PQCLEAN_MCELIECE6960119F_AVX_API_H + +#include + +#define PQCLEAN_MCELIECE6960119F_AVX_CRYPTO_ALGNAME "Classic McEliece 6960119f" +#define PQCLEAN_MCELIECE6960119F_AVX_CRYPTO_PUBLICKEYBYTES 1047319 +#define PQCLEAN_MCELIECE6960119F_AVX_CRYPTO_SECRETKEYBYTES 13908 +#define PQCLEAN_MCELIECE6960119F_AVX_CRYPTO_CIPHERTEXTBYTES 226 +#define PQCLEAN_MCELIECE6960119F_AVX_CRYPTO_BYTES 32 + + +int 
PQCLEAN_MCELIECE6960119F_AVX_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const uint8_t *pk +); + +int PQCLEAN_MCELIECE6960119F_AVX_crypto_kem_dec( + uint8_t *key, + const uint8_t *c, + const uint8_t *sk +); + +int PQCLEAN_MCELIECE6960119F_AVX_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +); + +#endif diff --git a/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/benes.c b/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/benes.c new file mode 100644 index 0000000000..a1c9e78cc9 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/benes.c @@ -0,0 +1,311 @@ +/* + This file is for Benes network related functions +*/ + +#include "benes.h" + +#include "params.h" +#include "transpose.h" +#include "util.h" + +static void layer_x(vec128 *data, vec128 *bits) { + int i; + vec128 v0, v1; + vec128 d; + + for (i = 0; i < 64; i += 2) { + v0 = PQCLEAN_MCELIECE6960119F_AVX_vec128_unpack_low(data[i + 0], data[i + 1]); + v1 = PQCLEAN_MCELIECE6960119F_AVX_vec128_unpack_high(data[i + 0], data[i + 1]); + + d = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(v0, v1); + d = PQCLEAN_MCELIECE6960119F_AVX_vec128_and(d, *bits++); + v0 = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(v0, d); + v1 = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(v1, d); + + data[i + 0] = PQCLEAN_MCELIECE6960119F_AVX_vec128_unpack_low(v0, v1); + data[i + 1] = PQCLEAN_MCELIECE6960119F_AVX_vec128_unpack_high(v0, v1); + } +} + +static void layer_0(vec128 *bs, vec128 *cond) { + int x; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 2) { + diff = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ x ], bs[ x + 1 ]); + diff = PQCLEAN_MCELIECE6960119F_AVX_vec128_and(diff, *cond++); + bs[ x ] = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ x ], diff); + bs[ x + 1 ] = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ x + 1 ], diff); + } +} + +static void layer_1(vec128 *bs, vec128 *cond) { + int x; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 4) { + diff = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ x + 0 ], bs[ x + 2 ]); + diff = PQCLEAN_MCELIECE6960119F_AVX_vec128_and(diff, cond[0]); + bs[ x + 0 ] = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ x + 0 ], diff); + bs[ x + 2 ] = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ x + 2 ], diff); + + diff = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ x + 1 ], bs[ x + 3 ]); + diff = PQCLEAN_MCELIECE6960119F_AVX_vec128_and(diff, cond[1]); + bs[ x + 1 ] = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ x + 1 ], diff); + bs[ x + 3 ] = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ x + 3 ], diff); + + cond += 2; + } +} + +static void layer_2(vec128 *bs, vec128 *cond) { + int x; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 8) { + diff = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ x + 0 ], bs[ x + 4 ]); + diff = PQCLEAN_MCELIECE6960119F_AVX_vec128_and(diff, cond[0]); + bs[ x + 0 ] = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ x + 0 ], diff); + bs[ x + 4 ] = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ x + 4 ], diff); + + diff = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ x + 1 ], bs[ x + 5 ]); + diff = PQCLEAN_MCELIECE6960119F_AVX_vec128_and(diff, cond[1]); + bs[ x + 1 ] = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ x + 1 ], diff); + bs[ x + 5 ] = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ x + 5 ], diff); + + diff = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ x + 2 ], bs[ x + 6 ]); + diff = PQCLEAN_MCELIECE6960119F_AVX_vec128_and(diff, cond[2]); + bs[ x + 2 ] = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ x + 2 ], diff); + bs[ x + 6 ] = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ x + 6 ], diff); + + diff = 
PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ x + 3 ], bs[ x + 7 ]); + diff = PQCLEAN_MCELIECE6960119F_AVX_vec128_and(diff, cond[3]); + bs[ x + 3 ] = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ x + 3 ], diff); + bs[ x + 7 ] = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ x + 7 ], diff); + + cond += 4; + } +} + +static void layer_3(vec128 *bs, vec128 *cond) { + int x, s; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 16) { + for (s = x; s < x + 8; s += 4) { + diff = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ s + 0 ], bs[ s + 8 ]); + diff = PQCLEAN_MCELIECE6960119F_AVX_vec128_and(diff, cond[0]); + bs[ s + 0 ] = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ s + 0 ], diff); + bs[ s + 8 ] = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ s + 8 ], diff); + + diff = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ s + 1 ], bs[ s + 9 ]); + diff = PQCLEAN_MCELIECE6960119F_AVX_vec128_and(diff, cond[1]); + bs[ s + 1 ] = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ s + 1 ], diff); + bs[ s + 9 ] = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ s + 9 ], diff); + + diff = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ s + 2 ], bs[ s + 10 ]); + diff = PQCLEAN_MCELIECE6960119F_AVX_vec128_and(diff, cond[2]); + bs[ s + 2 ] = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ s + 2 ], diff); + bs[ s + 10 ] = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ s + 10 ], diff); + + diff = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ s + 3 ], bs[ s + 11 ]); + diff = PQCLEAN_MCELIECE6960119F_AVX_vec128_and(diff, cond[3]); + bs[ s + 3 ] = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ s + 3 ], diff); + bs[ s + 11 ] = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ s + 11 ], diff); + + cond += 4; + } + } +} + +static void layer_4(vec128 *bs, vec128 *cond) { + int x, s; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 32) { + for (s = x; s < x + 16; s += 4) { + diff = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ s + 0 ], bs[ s + 16 ]); + diff = PQCLEAN_MCELIECE6960119F_AVX_vec128_and(diff, cond[0]); + bs[ s + 0 ] = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ s + 0 ], diff); + bs[ s + 16 ] = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ s + 16 ], diff); + + diff = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ s + 1 ], bs[ s + 17 ]); + diff = PQCLEAN_MCELIECE6960119F_AVX_vec128_and(diff, cond[1]); + bs[ s + 1 ] = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ s + 1 ], diff); + bs[ s + 17 ] = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ s + 17 ], diff); + + diff = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ s + 2 ], bs[ s + 18 ]); + diff = PQCLEAN_MCELIECE6960119F_AVX_vec128_and(diff, cond[2]); + bs[ s + 2 ] = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ s + 2 ], diff); + bs[ s + 18 ] = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ s + 18 ], diff); + + diff = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ s + 3 ], bs[ s + 19 ]); + diff = PQCLEAN_MCELIECE6960119F_AVX_vec128_and(diff, cond[3]); + bs[ s + 3 ] = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ s + 3 ], diff); + bs[ s + 19 ] = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ s + 19 ], diff); + + cond += 4; + } + } +} + +static void layer_5(vec128 *bs, vec128 *cond) { + int x, s; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 64) { + for (s = x; s < x + 32; s += 4) { + diff = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ s + 0 ], bs[ s + 32 ]); + diff = PQCLEAN_MCELIECE6960119F_AVX_vec128_and(diff, cond[0]); + bs[ s + 0 ] = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ s + 0 ], diff); + bs[ s + 32 ] = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ s + 32 ], diff); + + diff = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ s + 1 
], bs[ s + 33 ]); + diff = PQCLEAN_MCELIECE6960119F_AVX_vec128_and(diff, cond[1]); + bs[ s + 1 ] = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ s + 1 ], diff); + bs[ s + 33 ] = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ s + 33 ], diff); + + diff = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ s + 2 ], bs[ s + 34 ]); + diff = PQCLEAN_MCELIECE6960119F_AVX_vec128_and(diff, cond[2]); + bs[ s + 2 ] = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ s + 2 ], diff); + bs[ s + 34 ] = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ s + 34 ], diff); + + diff = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ s + 3 ], bs[ s + 35 ]); + diff = PQCLEAN_MCELIECE6960119F_AVX_vec128_and(diff, cond[3]); + bs[ s + 3 ] = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ s + 3 ], diff); + bs[ s + 35 ] = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(bs[ s + 35 ], diff); + + cond += 4; + } + } +} + +/* input: bits, control bits as array of bytes */ +/* output: bits_int, control bits as array of 128-bit vectors */ +void PQCLEAN_MCELIECE6960119F_AVX_load_bits(vec128 bits_int[][32], const unsigned char *bits) { + int i, j; + const unsigned char *ptr = bits; + + vec128 buf[64]; + + for (i = 0; i <= 5; i += 2) { + for (j = 0; j < 64; j++) { + buf[j] = PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(PQCLEAN_MCELIECE6960119F_AVX_load8(ptr), PQCLEAN_MCELIECE6960119F_AVX_load8(ptr + 512)); + ptr += 8; + } + + PQCLEAN_MCELIECE6960119F_AVX_transpose_64x128_sp( buf ); + + for (j = 0; j < 32; j++) { + bits_int[i + 0][j] = PQCLEAN_MCELIECE6960119F_AVX_vec128_unpack_low(buf[j], buf[j + 32]); + bits_int[i + 1][j] = PQCLEAN_MCELIECE6960119F_AVX_vec128_unpack_high(buf[j], buf[j + 32]); + } + + ptr += 512; + } + + for (i = 6; i <= 18; i++) { + for (j = 0; j < 32; j++) { + bits_int[i][j] = PQCLEAN_MCELIECE6960119F_AVX_load16(ptr); + ptr += 16; + } + } + + for (i = 19; i < 25; i += 2) { + for (j = 0; j < 64; j++) { + buf[j] = PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(PQCLEAN_MCELIECE6960119F_AVX_load8(ptr), PQCLEAN_MCELIECE6960119F_AVX_load8(ptr + 512)); + ptr += 8; + } + + PQCLEAN_MCELIECE6960119F_AVX_transpose_64x128_sp( buf ); + + for (j = 0; j < 32; j++) { + bits_int[i + 0][j] = PQCLEAN_MCELIECE6960119F_AVX_vec128_unpack_low(buf[j], buf[j + 32]); + bits_int[i + 1][j] = PQCLEAN_MCELIECE6960119F_AVX_vec128_unpack_high(buf[j], buf[j + 32]); + } + + ptr += 512; + } +} + +/* input: r, sequence of bits to be permuted */ +/* b, control bits as array of 128-bit vectors */ +/* rev, 0 for normal application; !0 for inverse */ +/* output: r, permuted bits */ +void PQCLEAN_MCELIECE6960119F_AVX_benes(vec128 *r, vec128 b[][32], int rev) { + int inc; + + vec128 *b_ptr; + + if (rev == 0) { + inc = 32; + b_ptr = b[ 0]; + } else { + inc = -32; + b_ptr = b[24]; + } + + // + + PQCLEAN_MCELIECE6960119F_AVX_transpose_64x128_sp( r ); + + layer_0(r, b_ptr); + b_ptr += inc; + layer_1(r, b_ptr); + b_ptr += inc; + layer_2(r, b_ptr); + b_ptr += inc; + layer_3(r, b_ptr); + b_ptr += inc; + layer_4(r, b_ptr); + b_ptr += inc; + layer_5(r, b_ptr); + b_ptr += inc; + + PQCLEAN_MCELIECE6960119F_AVX_transpose_64x128_sp( r ); + + layer_x(r, b_ptr); + b_ptr += inc; + layer_0(r, b_ptr); + b_ptr += inc; + layer_1(r, b_ptr); + b_ptr += inc; + layer_2(r, b_ptr); + b_ptr += inc; + layer_3(r, b_ptr); + b_ptr += inc; + layer_4(r, b_ptr); + b_ptr += inc; + layer_5(r, b_ptr); + b_ptr += inc; + layer_4(r, b_ptr); + b_ptr += inc; + layer_3(r, b_ptr); + b_ptr += inc; + layer_2(r, b_ptr); + b_ptr += inc; + layer_1(r, b_ptr); + b_ptr += inc; + layer_0(r, b_ptr); + b_ptr += inc; + layer_x(r, b_ptr); + b_ptr += inc; 
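+ // last stage: transpose again, apply the conditional-swap layers in descending order (5 down to 0), then transpose back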
+ + PQCLEAN_MCELIECE6960119F_AVX_transpose_64x128_sp( r ); + + layer_5(r, b_ptr); + b_ptr += inc; + layer_4(r, b_ptr); + b_ptr += inc; + layer_3(r, b_ptr); + b_ptr += inc; + layer_2(r, b_ptr); + b_ptr += inc; + layer_1(r, b_ptr); + b_ptr += inc; + layer_0(r, b_ptr); + //b_ptr += inc; + + PQCLEAN_MCELIECE6960119F_AVX_transpose_64x128_sp( r ); +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/benes.h b/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/benes.h new file mode 100644 index 0000000000..5dd0156ba3 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/benes.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE6960119F_AVX_BENES_H +#define PQCLEAN_MCELIECE6960119F_AVX_BENES_H +/* + This file is for Benes network related functions +*/ + + +#include "vec128.h" + +void PQCLEAN_MCELIECE6960119F_AVX_load_bits(vec128 /*bits_int*/[][32], const unsigned char * /*bits*/); +void PQCLEAN_MCELIECE6960119F_AVX_benes(vec128 * /*r*/, vec128 /*b*/[][32], int /*rev*/); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/bm.c b/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/bm.c new file mode 100644 index 0000000000..a5d56e0906 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/bm.c @@ -0,0 +1,215 @@ +/* + This file is for the inversion-free Berlekamp-Massey algorithm + see https://ieeexplore.ieee.org/document/87857 +*/ + +#include "bm.h" + +#include "gf.h" +#include "params.h" +#include "vec128.h" + +#include + +extern gf PQCLEAN_MCELIECE6960119F_AVX_vec_reduce_asm(vec128 *); +extern void PQCLEAN_MCELIECE6960119F_AVX_update_asm(void *, gf, int); + +static inline uint16_t mask_nonzero(gf a) { + uint32_t ret = a; + + ret -= 1; + ret >>= 31; + ret -= 1; + + return ret; +} + +static inline uint16_t mask_leq(uint16_t a, uint16_t b) { + uint32_t a_tmp = a; + uint32_t b_tmp = b; + uint32_t ret = b_tmp - a_tmp; + + ret >>= 31; + ret -= 1; + + return ret; +} + +static inline void vec128_cmov(vec128 out[][2], uint16_t mask) { + int i; + + vec128 v0, v1; + + vec128 m0 = PQCLEAN_MCELIECE6960119F_AVX_vec128_set1_16b( mask); + vec128 m1 = PQCLEAN_MCELIECE6960119F_AVX_vec128_set1_16b(~mask); + + for (i = 0; i < GFBITS; i++) { + v0 = PQCLEAN_MCELIECE6960119F_AVX_vec128_and(out[i][1], m0); + v1 = PQCLEAN_MCELIECE6960119F_AVX_vec128_and(out[i][0], m1); + out[i][0] = PQCLEAN_MCELIECE6960119F_AVX_vec128_or(v0, v1); + } +} + +static inline void interleave(vec256 *in, int idx0, int idx1, vec256 *mask, int b) { + int s = 1 << b; + + vec256 x, y; + + x = PQCLEAN_MCELIECE6960119F_AVX_vec256_or(PQCLEAN_MCELIECE6960119F_AVX_vec256_and(in[idx0], mask[0]), + PQCLEAN_MCELIECE6960119F_AVX_vec256_sll_4x(PQCLEAN_MCELIECE6960119F_AVX_vec256_and(in[idx1], mask[0]), s)); + + y = PQCLEAN_MCELIECE6960119F_AVX_vec256_or(PQCLEAN_MCELIECE6960119F_AVX_vec256_srl_4x(PQCLEAN_MCELIECE6960119F_AVX_vec256_and(in[idx0], mask[1]), s), + PQCLEAN_MCELIECE6960119F_AVX_vec256_and(in[idx1], mask[1])); + + in[idx0] = x; + in[idx1] = y; +} + +/* input: in, field elements in bitsliced form */ +/* output: out, field elements in non-bitsliced form */ +static inline void get_coefs(gf *out, vec256 *in) { + int i, k; + + vec256 mask[4][2]; + vec256 buf[16]; + + for (i = 0; i < 13; i++) { + buf[i] = in[i]; + } + for (i = 13; i < 16; i++) { + buf[i] = PQCLEAN_MCELIECE6960119F_AVX_vec256_setzero(); + } + + mask[0][0] = PQCLEAN_MCELIECE6960119F_AVX_vec256_set1_16b(0x5555); + mask[0][1] = PQCLEAN_MCELIECE6960119F_AVX_vec256_set1_16b(0xAAAA); + mask[1][0] = 
PQCLEAN_MCELIECE6960119F_AVX_vec256_set1_16b(0x3333); + mask[1][1] = PQCLEAN_MCELIECE6960119F_AVX_vec256_set1_16b(0xCCCC); + mask[2][0] = PQCLEAN_MCELIECE6960119F_AVX_vec256_set1_16b(0x0F0F); + mask[2][1] = PQCLEAN_MCELIECE6960119F_AVX_vec256_set1_16b(0xF0F0); + mask[3][0] = PQCLEAN_MCELIECE6960119F_AVX_vec256_set1_16b(0x00FF); + mask[3][1] = PQCLEAN_MCELIECE6960119F_AVX_vec256_set1_16b(0xFF00); + + interleave(buf, 0, 8, mask[3], 3); + interleave(buf, 1, 9, mask[3], 3); + interleave(buf, 2, 10, mask[3], 3); + interleave(buf, 3, 11, mask[3], 3); + interleave(buf, 4, 12, mask[3], 3); + interleave(buf, 5, 13, mask[3], 3); + interleave(buf, 6, 14, mask[3], 3); + interleave(buf, 7, 15, mask[3], 3); + + interleave(buf, 0, 4, mask[2], 2); + interleave(buf, 1, 5, mask[2], 2); + interleave(buf, 2, 6, mask[2], 2); + interleave(buf, 3, 7, mask[2], 2); + interleave(buf, 8, 12, mask[2], 2); + interleave(buf, 9, 13, mask[2], 2); + interleave(buf, 10, 14, mask[2], 2); + interleave(buf, 11, 15, mask[2], 2); + + interleave(buf, 0, 2, mask[1], 1); + interleave(buf, 1, 3, mask[1], 1); + interleave(buf, 4, 6, mask[1], 1); + interleave(buf, 5, 7, mask[1], 1); + interleave(buf, 8, 10, mask[1], 1); + interleave(buf, 9, 11, mask[1], 1); + interleave(buf, 12, 14, mask[1], 1); + interleave(buf, 13, 15, mask[1], 1); + + interleave(buf, 0, 1, mask[0], 0); + interleave(buf, 2, 3, mask[0], 0); + interleave(buf, 4, 5, mask[0], 0); + interleave(buf, 6, 7, mask[0], 0); + interleave(buf, 8, 9, mask[0], 0); + interleave(buf, 10, 11, mask[0], 0); + interleave(buf, 12, 13, mask[0], 0); + interleave(buf, 14, 15, mask[0], 0); + + for (i = 0; i < 16; i++) { + for (k = 0; k < 4; k++) { + out[ (4 * 0 + k) * 16 + i ] = (PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(buf[i], 0) >> (k * 16)) & GFMASK; + out[ (4 * 1 + k) * 16 + i ] = (PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(buf[i], 1) >> (k * 16)) & GFMASK; + out[ (4 * 2 + k) * 16 + i ] = (PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(buf[i], 2) >> (k * 16)) & GFMASK; + out[ (4 * 3 + k) * 16 + i ] = (PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(buf[i], 3) >> (k * 16)) & GFMASK; + } + } +} + +typedef union { + vec128 as_128[GFBITS][2]; + vec256 as_256[GFBITS]; +} aligned_double_vec128; + +/* input: in, sequence of field elements */ +/* output: out, minimal polynomial of in */ +void PQCLEAN_MCELIECE6960119F_AVX_bm(vec128 *out, vec256 *in) { + int i; + uint16_t N, L; + uint16_t mask; + uint64_t one = 1; + uint64_t v[2]; + + vec128 prod[ GFBITS ]; + vec128 interval[GFBITS]; + + aligned_double_vec128 db; + aligned_double_vec128 BC_tmp; + aligned_double_vec128 BC; + + gf d, b; + gf coefs[256]; + + // initialization + + get_coefs(coefs, in); + + BC.as_128[0][0] = PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0, one << 62); + BC.as_128[0][1] = PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0, one << 63); + + for (i = 1; i < GFBITS; i++) { + BC.as_128[i][0] = BC.as_128[i][1] = PQCLEAN_MCELIECE6960119F_AVX_vec128_setzero(); + } + + b = 1; + L = 0; + + // + + for (i = 0; i < GFBITS; i++) { + interval[i] = PQCLEAN_MCELIECE6960119F_AVX_vec128_setzero(); + } + + for (N = 0; N < SYS_T * 2; N++) { + PQCLEAN_MCELIECE6960119F_AVX_update_asm(interval, coefs[N], 16); + PQCLEAN_MCELIECE6960119F_AVX_vec128_mul_asm(prod, interval, BC.as_128[0] + 1, 32); + + d = PQCLEAN_MCELIECE6960119F_AVX_vec_reduce_asm(prod); + + mask = mask_nonzero(d) & mask_leq(L * 2, N); + + for (i = 0; i < GFBITS; i++) { + db.as_128[i][0] = PQCLEAN_MCELIECE6960119F_AVX_vec128_setbits((d >> i) & 1); + db.as_128[i][1] = 
PQCLEAN_MCELIECE6960119F_AVX_vec128_setbits((b >> i) & 1); + } + + PQCLEAN_MCELIECE6960119F_AVX_vec256_mul(BC_tmp.as_256, db.as_256, BC.as_256); + + vec128_cmov(BC.as_128, mask); + PQCLEAN_MCELIECE6960119F_AVX_update_asm(BC.as_128, 0, 32); + + for (i = 0; i < GFBITS; i++) { + BC.as_128[i][1] = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(BC_tmp.as_128[i][0], BC_tmp.as_128[i][1]); + } + + b = (d & mask) | (b & ~mask); + L = ((N + 1 - L) & mask) | (L & ~mask); + } + + for (i = 0; i < GFBITS; i++) { + v[0] = PQCLEAN_MCELIECE6960119F_AVX_vec128_extract(BC.as_128[i][1], 0); + v[1] = PQCLEAN_MCELIECE6960119F_AVX_vec128_extract(BC.as_128[i][1], 1); + + out[i] = PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x((v[0] >> 8) | (v[1] << 56), v[1] >> 8); + } +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/bm.h b/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/bm.h new file mode 100644 index 0000000000..6246a3399a --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/bm.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE6960119F_AVX_BM_H +#define PQCLEAN_MCELIECE6960119F_AVX_BM_H +/* + This file is for the inversion-free Berlekamp-Massey algorithm + see https://ieeexplore.ieee.org/document/87857 +*/ + +#include "vec128.h" +#include "vec256.h" + +void PQCLEAN_MCELIECE6960119F_AVX_bm(vec128 * /*out*/, vec256 * /*in*/); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/consts.S b/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/consts.S new file mode 100644 index 0000000000..a3096c1f0b --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/consts.S @@ -0,0 +1,33 @@ +.data + +# not supported on MacOS +#.section .rodata + +.globl PQCLEAN_MCELIECE6960119F_AVX_MASK0_0 +.globl PQCLEAN_MCELIECE6960119F_AVX_MASK0_1 +.globl PQCLEAN_MCELIECE6960119F_AVX_MASK1_0 +.globl PQCLEAN_MCELIECE6960119F_AVX_MASK1_1 +.globl PQCLEAN_MCELIECE6960119F_AVX_MASK2_0 +.globl PQCLEAN_MCELIECE6960119F_AVX_MASK2_1 +.globl PQCLEAN_MCELIECE6960119F_AVX_MASK3_0 +.globl PQCLEAN_MCELIECE6960119F_AVX_MASK3_1 +.globl PQCLEAN_MCELIECE6960119F_AVX_MASK4_0 +.globl PQCLEAN_MCELIECE6960119F_AVX_MASK4_1 +.globl PQCLEAN_MCELIECE6960119F_AVX_MASK5_0 +.globl PQCLEAN_MCELIECE6960119F_AVX_MASK5_1 + +.p2align 5 + +PQCLEAN_MCELIECE6960119F_AVX_MASK0_0: .quad 0x5555555555555555, 0x5555555555555555, 0x5555555555555555, 0x5555555555555555 +PQCLEAN_MCELIECE6960119F_AVX_MASK0_1: .quad 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA +PQCLEAN_MCELIECE6960119F_AVX_MASK1_0: .quad 0x3333333333333333, 0x3333333333333333, 0x3333333333333333, 0x3333333333333333 +PQCLEAN_MCELIECE6960119F_AVX_MASK1_1: .quad 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC +PQCLEAN_MCELIECE6960119F_AVX_MASK2_0: .quad 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F +PQCLEAN_MCELIECE6960119F_AVX_MASK2_1: .quad 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0 +PQCLEAN_MCELIECE6960119F_AVX_MASK3_0: .quad 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF +PQCLEAN_MCELIECE6960119F_AVX_MASK3_1: .quad 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00 +PQCLEAN_MCELIECE6960119F_AVX_MASK4_0: .quad 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF +PQCLEAN_MCELIECE6960119F_AVX_MASK4_1: .quad 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000 +PQCLEAN_MCELIECE6960119F_AVX_MASK5_0: 
.quad 0x00000000FFFFFFFF, 0x00000000FFFFFFFF, 0x00000000FFFFFFFF, 0x00000000FFFFFFFF +PQCLEAN_MCELIECE6960119F_AVX_MASK5_1: .quad 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000 + diff --git a/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/consts.inc b/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/consts.inc new file mode 100644 index 0000000000..7f87f10e90 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/consts.inc @@ -0,0 +1,502 @@ +//64 +{ + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X6969969669699696, 0X6969969669699696, 0X6969969669699696, 0X6969969669699696), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X9966669966999966, 0X9966669966999966, 0X9966669966999966, 0X9966669966999966), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X9966669966999966, 0X9966669966999966, 0X9966669966999966, 0X9966669966999966), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0XFF0000FF00FFFF00, 0XFF0000FF00FFFF00, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XCC3333CCCC3333CC, 0XCC3333CCCC3333CC, 0XCC3333CCCC3333CC, 0XCC3333CCCC3333CC), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X9966669966999966, 0X9966669966999966, 0X9966669966999966, 0X9966669966999966), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X6666666666666666, 0X6666666666666666, 0X6666666666666666, 0X6666666666666666), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XA55AA55AA55AA55A, 0XA55AA55AA55AA55A, 0XA55AA55AA55AA55A, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XCCCC33333333CCCC, 0XCCCC33333333CCCC, 0XCCCC33333333CCCC, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0FF0F00FF00F0FF0, 0X0FF0F00FF00F0FF0, 0X0FF0F00FF00F0FF0, 0X0FF0F00FF00F0FF0), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A) +}, +//128 +{ + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X9999999966666666, 0X6666666699999999, 0X9999999966666666, 0X6666666699999999), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3, 0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XCC33CC3333CC33CC, 0X33CC33CCCC33CC33, 0XCC33CC3333CC33CC, 0X33CC33CCCC33CC33), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X3C3C3C3C3C3C3C3C, 0X3C3C3C3C3C3C3C3C, 0X3C3C3C3C3C3C3C3C, 0X3C3C3C3C3C3C3C3C), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XAA5555AAAA5555AA, 0XAA5555AAAA5555AA, 0XAA5555AAAA5555AA, 0XAA5555AAAA5555AA), + 
PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XC33C3CC33CC3C33C, 0XC33C3CC33CC3C33C, 0XC33C3CC33CC3C33C, 0XC33C3CC33CC3C33C), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X00FFFF0000FFFF00, 0XFF0000FFFF0000FF, 0X00FFFF0000FFFF00, 0XFF0000FFFF0000FF) +}, +//256 +{ + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0FF0F00FF00F0FF0, 0X0FF0F00FF00F0FF0, 0XF00F0FF00FF0F00F, 0XF00F0FF00FF0F00F), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X9669966969966996, 0X6996699696699669, 0X6996699696699669, 0X9669966969966996), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000FFFFFFFF0000, 0XFFFF00000000FFFF, 0X0000FFFFFFFF0000, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X33333333CCCCCCCC, 0X33333333CCCCCCCC, 0X33333333CCCCCCCC, 0X33333333CCCCCCCC), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0X5AA5A55AA55A5AA5, 0X5AA5A55AA55A5AA5, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X00FFFF0000FFFF00, 0XFF0000FFFF0000FF, 0XFF0000FFFF0000FF, 0X00FFFF0000FFFF00), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XC33CC33CC33CC33C, 0XC33CC33CC33CC33C, 0XC33CC33CC33CC33C, 0XC33CC33CC33CC33C), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555) +}, +//512 +{ + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XC33C3CC33CC3C33C, 0X3CC3C33CC33C3CC3, 0XC33C3CC33CC3C33C, 0X3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X9966669966999966, 0X6699996699666699, 0X9966669966999966, 0X6699996699666699), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X9966996699669966, 0X6699669966996699, 0X6699669966996699, 0X9966996699669966), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X6969969669699696, 0X6969969669699696, 0X6969969669699696, 0X6969969669699696), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XAA55AA5555AA55AA, 0XAA55AA5555AA55AA, 0XAA55AA5555AA55AA, 0XAA55AA5555AA55AA), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X9966996699669966, 0X9966996699669966, 0X6699669966996699, 0X6699669966996699), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X5AA5A55A5AA5A55A, 0XA55A5AA5A55A5AA5, 0X5AA5A55A5AA5A55A, 0XA55A5AA5A55A5AA5), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XC3C3C3C33C3C3C3C, 0XC3C3C3C33C3C3C3C, 0X3C3C3C3CC3C3C3C3, 0X3C3C3C3CC3C3C3C3), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X3333CCCC3333CCCC, 0X3333CCCC3333CCCC, 0XCCCC3333CCCC3333, 0XCCCC3333CCCC3333), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X9999999966666666, 0X6666666699999999, 0X6666666699999999, 0X9999999966666666), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XC33CC33CC33CC33C, 0X3CC33CC33CC33CC3, 0XC33CC33CC33CC33C, 0X3CC33CC33CC33CC3), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X6666999999996666, 0X9999666666669999, 0X9999666666669999, 0X6666999999996666) +}, +{ + 
PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XC33C3CC33CC3C33C, 0X3CC3C33CC33C3CC3, 0XC33C3CC33CC3C33C, 0X3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X6699996699666699, 0X9966669966999966, 0X6699996699666699, 0X9966669966999966), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X6699669966996699, 0X9966996699669966, 0X9966996699669966, 0X6699669966996699), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X6969969669699696, 0X6969969669699696, 0X6969969669699696, 0X6969969669699696), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X55AA55AAAA55AA55, 0X55AA55AAAA55AA55, 0X55AA55AAAA55AA55, 0X55AA55AAAA55AA55), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X9966996699669966, 0X9966996699669966, 0X6699669966996699, 0X6699669966996699), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X5AA5A55A5AA5A55A, 0XA55A5AA5A55A5AA5, 0X5AA5A55A5AA5A55A, 0XA55A5AA5A55A5AA5), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XC3C3C3C33C3C3C3C, 0XC3C3C3C33C3C3C3C, 0X3C3C3C3CC3C3C3C3, 0X3C3C3C3CC3C3C3C3), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X3333CCCC3333CCCC, 0X3333CCCC3333CCCC, 0XCCCC3333CCCC3333, 0XCCCC3333CCCC3333), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X9999999966666666, 0X6666666699999999, 0X6666666699999999, 0X9999999966666666), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XC33CC33CC33CC33C, 0X3CC33CC33CC33CC3, 0XC33CC33CC33CC33C, 0X3CC33CC33CC33CC3), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X6666999999996666, 0X9999666666669999, 0X9999666666669999, 0X6666999999996666) +}, +//1024 +{ + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3, 0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA, 0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA, 0XAA5555AA55AAAA55, 0XAA5555AA55AAAA55), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XF00F0FF0F00F0FF0, 0XF00F0FF0F00F0FF0, 0X0FF0F00F0FF0F00F, 0X0FF0F00F0FF0F00F), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X9669699696696996, 0X9669699696696996, 0X9669699696696996, 0X9669699696696996), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XA55AA55AA55AA55A, 0X5AA55AA55AA55AA5, 0X5AA55AA55AA55AA5, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA, 0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XCCCC33333333CCCC, 0X3333CCCCCCCC3333, 0X3333CCCCCCCC3333, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000, 0XFFFF00000000FFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X6996699669966996, 0X9669966996699669, 0X9669966996699669, 0X6996699669966996) +}, +{ + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3, 0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA, 0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + 
PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XAA5555AA55AAAA55, 0XAA5555AA55AAAA55, 0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0FF0F00F0FF0F00F, 0X0FF0F00F0FF0F00F, 0XF00F0FF0F00F0FF0, 0XF00F0FF0F00F0FF0), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X6996966969969669, 0X6996966969969669, 0X6996966969969669, 0X6996966969969669), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XA55AA55AA55AA55A, 0X5AA55AA55AA55AA5, 0X5AA55AA55AA55AA5, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555, 0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XCCCC33333333CCCC, 0X3333CCCCCCCC3333, 0X3333CCCCCCCC3333, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000, 0XFFFF00000000FFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X6996699669966996, 0X9669966996699669, 0X9669966996699669, 0X6996699669966996) +}, +{ + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3, 0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555, 0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XAA5555AA55AAAA55, 0XAA5555AA55AAAA55, 0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XF00F0FF0F00F0FF0, 0XF00F0FF0F00F0FF0, 0X0FF0F00F0FF0F00F, 0X0FF0F00F0FF0F00F), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X9669699696696996, 0X9669699696696996, 0X9669699696696996, 0X9669699696696996), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XA55AA55AA55AA55A, 0X5AA55AA55AA55AA5, 0X5AA55AA55AA55AA5, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA, 0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XCCCC33333333CCCC, 0X3333CCCCCCCC3333, 0X3333CCCCCCCC3333, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000, 0XFFFF00000000FFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X6996699669966996, 0X9669966996699669, 0X9669966996699669, 0X6996699669966996) +}, +{ + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3, 0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555, 0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + 
PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA, 0XAA5555AA55AAAA55, 0XAA5555AA55AAAA55), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0FF0F00F0FF0F00F, 0X0FF0F00F0FF0F00F, 0XF00F0FF0F00F0FF0, 0XF00F0FF0F00F0FF0), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X6996966969969669, 0X6996966969969669, 0X6996966969969669, 0X6996966969969669), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XA55AA55AA55AA55A, 0X5AA55AA55AA55AA5, 0X5AA55AA55AA55AA5, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555, 0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XCCCC33333333CCCC, 0X3333CCCCCCCC3333, 0X3333CCCCCCCC3333, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000, 0XFFFF00000000FFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X6996699669966996, 0X9669966996699669, 0X9669966996699669, 0X6996699669966996) +}, +//2048 +{ + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA, 0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F, 0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55, 0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + 
PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0, 0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55, 0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F, 0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA, 0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0, 0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + 
PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X5555555555555555, 0X5555555555555555, 0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA, 0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F, 0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X5555555555555555, 0X5555555555555555, 0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55, 0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0, 0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + 
PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X5555555555555555, 0X5555555555555555, 0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55, 0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F, 0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X5555555555555555, 0X5555555555555555, 0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA, 0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0, 0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + 
PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +//4096 +{ + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + 
PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + 
PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + 
PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + 
PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + 
PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + 
PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + 
PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +} diff --git a/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/controlbits.c b/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/controlbits.c new file mode 100644 index 0000000000..1982a804a4 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/controlbits.c @@ -0,0 +1,274 @@ +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. 
a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + +#include "controlbits.h" + +#include "params.h" + +#include <stdint.h> + +typedef uint8_t bit; + +#define N (1 << GFBITS) + +static bit is_smaller(uint32_t a, uint32_t b) { + uint32_t ret = 0; + + ret = a - b; + ret >>= 31; + + return (bit)ret; +} + +static bit is_smaller_63b(uint64_t a, uint64_t b) { + uint64_t ret = 0; + + ret = a - b; + ret >>= 63; + + return (bit)ret; +} + +static void cswap(uint32_t *x, uint32_t *y, bit swap) { + uint32_t m; + uint32_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +static void cswap_63b(uint64_t *x, uint64_t *y, bit swap) { + uint64_t m; + uint64_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +/* output x = min(input x,input y) */ +/* output y = max(input x,input y) */ + +static void minmax(uint32_t *x, uint32_t *y) { + bit m; + + m = is_smaller(*y, *x); + cswap(x, y, m); +} + +static void minmax_63b(uint64_t *x, uint64_t *y) { + bit m; + + m = is_smaller_63b(*y, *x); + cswap_63b(x, y, m); +} + +/* merge first half of x[0],x[step],...,x[(2*n-1)*step] with second half */ +/* requires n to be a power of 2 */ + +static void merge(int n, uint32_t *x, int step) { + int i; + if (n == 1) { + minmax(&x[0], &x[step]); + } else { + merge(n / 2, x, step * 2); + merge(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax(&x[i * step], &x[(i + 1) * step]); + } + } +} + +static void merge_63b(int n, uint64_t *x, int step) { + int i; + if (n == 1) { + minmax_63b(&x[0], &x[step]); + } else { + merge_63b(n / 2, x, step * 2); + merge_63b(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax_63b(&x[i * step], &x[(i + 1) * step]); + } + } +} + +/* sort x[0],x[1],...,x[n-1] in place */ +/* requires n to be a power of 2 */ + +static void sort(int n, uint32_t *x) { + if (n <= 1) { + return; + } + sort(n / 2, x); + sort(n / 2, x + n / 2); + merge(n / 2, x, 1); +} + +void PQCLEAN_MCELIECE6960119F_AVX_sort_63b(int n, uint64_t *x) { + if (n <= 1) { + return; + } + PQCLEAN_MCELIECE6960119F_AVX_sort_63b(n / 2, x); + PQCLEAN_MCELIECE6960119F_AVX_sort_63b(n / 2, x + n / 2); + merge_63b(n / 2, x, 1); +} + +/* y[pi[i]] = x[i] */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void composeinv(int n, uint32_t *y, const uint32_t *x, const uint32_t *pi) { // NC + int i; + uint32_t t[2 * N]; + + for (i = 0; i < n; ++i) { + t[i] = x[i] | (pi[i] << 16); + } + + sort(n, t); + + for (i = 0; i < n; ++i) { + y[i] = t[i] & 0xFFFF; + } +} + +/* ip[i] = j iff pi[i] = j */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void invert(int n, uint32_t *ip, const uint32_t *pi) { + int i; + + for (i = 0; i < n; i++) { + ip[i] = i; + } + + composeinv(n, ip, ip, pi); +} + + +static void flow(int w, uint32_t *x, const uint32_t *y, int t) { + bit m0; + bit m1; + + uint32_t b; + uint32_t y_copy = *y; + + m0 = is_smaller(*y & ((1 << w) - 1), *x & ((1 << w) - 1)); + m1 = is_smaller(0, t); + + cswap(x, &y_copy, m0); + b = m0 & m1; + *x ^= b << w; +} + +/* input: permutation pi */ +/* output: (2w-1)n/2 (or 0 if n==1) control bits c[0],c[step],c[2*step],... 
*/ +/* requires n = 2^w */ +static void controlbitsfrompermutation(int w, int n, int step, int off, unsigned char *c, const uint32_t *pi) { + int i; + int j; + int k; + int t; + uint32_t ip[N] = {0}; + uint32_t I[2 * N] = {0}; + uint32_t P[2 * N] = {0}; + uint32_t PI[2 * N] = {0}; + uint32_t T[2 * N] = {0}; + uint32_t piflip[N] = {0}; + uint32_t subpi[2][N / 2] = {{0}}; + + if (w == 1) { + c[ off / 8 ] |= (pi[0] & 1) << (off % 8); + } + if (w <= 1) { + return; + } + + invert(n, ip, pi); + + for (i = 0; i < n; ++i) { + I[i] = ip[i] | (1 << w); + I[n + i] = pi[i]; + } + + for (i = 0; i < 2 * n; ++i) { + P[i] = (i >> w) + (i & ((1 << w) - 2)) + ((i & 1) << w); + } + + for (t = 0; t < w; ++t) { + composeinv(2 * n, PI, P, I); + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &PI[i], t); + } + + for (i = 0; i < 2 * n; ++i) { + T[i] = I[i ^ 1]; + } + + composeinv(2 * n, I, I, T); + + for (i = 0; i < 2 * n; ++i) { + T[i] = P[i ^ 1]; + } + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &T[i], 1); + } + } + + for (i = 0; i < n; ++i) { + for (j = 0; j < w; ++j) { + piflip[i] = pi[i]; + } + } + + for (i = 0; i < n / 2; ++i) { + c[ (off + i * step) / 8 ] |= ((P[i * 2] >> w) & 1) << ((off + i * step) % 8); + } + for (i = 0; i < n / 2; ++i) { + c[ (off + ((w - 1)*n + i) * step) / 8 ] |= ((P[n + i * 2] >> w) & 1) << ((off + ((w - 1) * n + i) * step) % 8); + } + + for (i = 0; i < n / 2; ++i) { + cswap(&piflip[i * 2], &piflip[i * 2 + 1], (P[n + i * 2] >> w) & 1); + } + + for (k = 0; k < 2; ++k) { + for (i = 0; i < n / 2; ++i) { + subpi[k][i] = piflip[i * 2 + k] >> 1; + } + } + + for (k = 0; k < 2; ++k) { + controlbitsfrompermutation(w - 1, n / 2, step * 2, off + step * (n / 2 + k), c, subpi[k]); + } +} + +/* input: pi, a permutation*/ +/* output: out, control bits w.r.t. pi */ +void PQCLEAN_MCELIECE6960119F_AVX_controlbits(unsigned char *out, const uint32_t *pi) { + unsigned int i; + unsigned char c[ (2 * GFBITS - 1) * (1 << GFBITS) / 16 ]; + + for (i = 0; i < sizeof(c); i++) { + c[i] = 0; + } + + controlbitsfrompermutation(GFBITS, (1 << GFBITS), 1, 0, c, pi); + + for (i = 0; i < sizeof(c); i++) { + out[i] = c[i]; + } +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/controlbits.h b/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/controlbits.h new file mode 100644 index 0000000000..ef3b57e5d1 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/controlbits.h @@ -0,0 +1,15 @@ +#ifndef PQCLEAN_MCELIECE6960119F_AVX_CONTROLBITS_H +#define PQCLEAN_MCELIECE6960119F_AVX_CONTROLBITS_H +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. 
a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + + +#include <stdint.h> + +void PQCLEAN_MCELIECE6960119F_AVX_sort_63b(int n, uint64_t *x); +void PQCLEAN_MCELIECE6960119F_AVX_controlbits(unsigned char *out, const uint32_t *pi); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/crypto_hash.h b/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/crypto_hash.h new file mode 100644 index 0000000000..3676f10a27 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/crypto_hash.h @@ -0,0 +1,7 @@ +#ifndef PQCLEAN_MCELIECE6960119F_AVX_CRYPTO_HASH_H +#define PQCLEAN_MCELIECE6960119F_AVX_CRYPTO_HASH_H +#include "fips202.h" + +#define crypto_hash_32b(out,in,inlen) shake256(out, 32, in, inlen) + +#endif diff --git a/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/decrypt.c b/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/decrypt.c new file mode 100644 index 0000000000..493367f407 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/decrypt.c @@ -0,0 +1,236 @@ +/* + This file is for Niederreiter decryption +*/ + +#include "decrypt.h" + +#include "benes.h" +#include "bm.h" +#include "fft.h" +#include "fft_tr.h" +#include "params.h" +#include "util.h" + +#include <stdint.h> + +static void scaling(vec256 out[][GFBITS], vec256 inv[][GFBITS], const unsigned char *sk, vec256 *recv) { + int i, j; + + vec128 sk_int[ GFBITS ]; + vec256 eval[32][ GFBITS ]; + vec256 tmp[ GFBITS ]; + + // computing inverses + + PQCLEAN_MCELIECE6960119F_AVX_irr_load(sk_int, sk); + + PQCLEAN_MCELIECE6960119F_AVX_fft(eval, sk_int); + + for (i = 0; i < 32; i++) { + PQCLEAN_MCELIECE6960119F_AVX_vec256_sq(eval[i], eval[i]); + } + + PQCLEAN_MCELIECE6960119F_AVX_vec256_copy(inv[0], eval[0]); + + for (i = 1; i < 32; i++) { + PQCLEAN_MCELIECE6960119F_AVX_vec256_mul(inv[i], inv[i - 1], eval[i]); + } + + PQCLEAN_MCELIECE6960119F_AVX_vec256_inv(tmp, inv[31]); + + for (i = 30; i >= 0; i--) { + PQCLEAN_MCELIECE6960119F_AVX_vec256_mul(inv[i + 1], tmp, inv[i]); + PQCLEAN_MCELIECE6960119F_AVX_vec256_mul(tmp, tmp, eval[i + 1]); + } + + PQCLEAN_MCELIECE6960119F_AVX_vec256_copy(inv[0], tmp); + + // + + for (i = 0; i < 32; i++) { + for (j = 0; j < GFBITS; j++) { + out[i][j] = PQCLEAN_MCELIECE6960119F_AVX_vec256_and(inv[i][j], recv[i]); + } + } +} + +static void preprocess(vec128 *recv, const unsigned char *s) { + int i; + unsigned char r[ 1024 ]; + + for (i = 0; i < SYND_BYTES; i++) { + r[i] = s[i]; + } + + r[i - 1] &= (1 << ((GFBITS * SYS_T) % 8)) - 1; // throwing away redundant bits + + for (i = SYND_BYTES; i < 1024; i++) { + r[i] = 0; + } + + for (i = 0; i < 64; i++) { + recv[i] = PQCLEAN_MCELIECE6960119F_AVX_load16(r + i * 16); + } +} + +static void postprocess(unsigned char *e, vec128 *err) { + int i; + unsigned char error8[ (1 << GFBITS) / 8 ]; + uint64_t v[2]; + + for (i = 0; i < 64; i++) { + v[0] = PQCLEAN_MCELIECE6960119F_AVX_vec128_extract(err[i], 0); + v[1] = PQCLEAN_MCELIECE6960119F_AVX_vec128_extract(err[i], 1); + + PQCLEAN_MCELIECE6960119F_AVX_store8(error8 + i * 16 + 0, v[0]); + PQCLEAN_MCELIECE6960119F_AVX_store8(error8 + i * 16 + 8, v[1]); + } + + for (i = 0; i < SYS_N / 8; i++) { + e[i] = error8[i]; + } +} + +static void scaling_inv(vec256 out[][GFBITS], vec256 inv[][GFBITS], vec256 *recv) { + int i, j; + + for (i = 0; i < 32; i++) { + for (j = 0; j < GFBITS; j++) { + out[i][j] = PQCLEAN_MCELIECE6960119F_AVX_vec256_and(inv[i][j], recv[i]); + } + } +} + +static uint16_t weight_check(unsigned char *e, 
vec128 *error) { + int i; + uint16_t w0 = 0; + uint16_t w1 = 0; + uint16_t check; + + for (i = 0; i < 64; i++) { + w0 += _mm_popcnt_u64( PQCLEAN_MCELIECE6960119F_AVX_vec128_extract(error[i], 0) ); + w0 += _mm_popcnt_u64( PQCLEAN_MCELIECE6960119F_AVX_vec128_extract(error[i], 1) ); + } + + for (i = 0; i < SYS_N / 8; i++) { + w1 += _mm_popcnt_u64( e[i] ); + } + + check = (w0 ^ SYS_T) | (w1 ^ SYS_T); + check -= 1; + check >>= 15; + + return check; +} + +static uint16_t synd_cmp(vec256 *s0, vec256 *s1) { + int i; + vec256 diff; + + diff = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(s0[0], s1[0]); + + for (i = 1; i < GFBITS; i++) { + diff = PQCLEAN_MCELIECE6960119F_AVX_vec256_or(diff, PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(s0[i], s1[i])); + } + + return (uint16_t)PQCLEAN_MCELIECE6960119F_AVX_vec256_testz(diff); +} + +static void reformat_128to256(vec256 *out, vec128 *in) { + int i; + uint64_t v[4]; + + for (i = 0; i < 32; i++) { + v[0] = PQCLEAN_MCELIECE6960119F_AVX_vec128_extract(in[2 * i + 0], 0); + v[1] = PQCLEAN_MCELIECE6960119F_AVX_vec128_extract(in[2 * i + 0], 1); + v[2] = PQCLEAN_MCELIECE6960119F_AVX_vec128_extract(in[2 * i + 1], 0); + v[3] = PQCLEAN_MCELIECE6960119F_AVX_vec128_extract(in[2 * i + 1], 1); + + out[i] = PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(v[0], v[1], v[2], v[3]); + } +} + +static void reformat_256to128(vec128 *out, vec256 *in) { + int i; + uint64_t v[4]; + + for (i = 0; i < 32; i++) { + v[0] = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(in[i], 0); + v[1] = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(in[i], 1); + v[2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(in[i], 2); + v[3] = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(in[i], 3); + + out[2 * i + 0] = PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(v[0], v[1]); + out[2 * i + 1] = PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(v[2], v[3]); + } +} + +/* Niederreiter decryption with the Berlekamp decoder */ +/* intput: sk, secret key */ +/* c, ciphertext (syndrome) */ +/* output: e, error vector */ +/* return: 0 for success; 1 for failure */ +int PQCLEAN_MCELIECE6960119F_AVX_decrypt(unsigned char *e, const unsigned char *sk, const unsigned char *c) { + int i; + + uint16_t check_synd; + uint16_t check_weight; + + vec256 inv[ 32 ][ GFBITS ]; + vec256 scaled[ 32 ][ GFBITS ]; + vec256 eval[32][ GFBITS ]; + + vec128 error128[ 64 ]; + vec256 error256[ 32 ]; + + vec256 s_priv[ GFBITS ]; + vec256 s_priv_cmp[ GFBITS ]; + vec128 locator[ GFBITS ]; + + vec128 recv128[ 64 ]; + vec256 recv256[ 32 ]; + vec256 allone; + + vec128 bits_int[25][32]; + + // Berlekamp decoder + + preprocess(recv128, c); + + PQCLEAN_MCELIECE6960119F_AVX_load_bits(bits_int, sk + IRR_BYTES); + PQCLEAN_MCELIECE6960119F_AVX_benes(recv128, bits_int, 1); + + reformat_128to256(recv256, recv128); + + scaling(scaled, inv, sk, recv256); + PQCLEAN_MCELIECE6960119F_AVX_fft_tr(s_priv, scaled); + PQCLEAN_MCELIECE6960119F_AVX_bm(locator, s_priv); + + PQCLEAN_MCELIECE6960119F_AVX_fft(eval, locator); + + // reencryption and weight check + + allone = PQCLEAN_MCELIECE6960119F_AVX_vec256_set1_16b(0xFFFF); + + for (i = 0; i < 32; i++) { + error256[i] = PQCLEAN_MCELIECE6960119F_AVX_vec256_or_reduce(eval[i]); + error256[i] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(error256[i], allone); + } + + scaling_inv(scaled, inv, error256); + PQCLEAN_MCELIECE6960119F_AVX_fft_tr(s_priv_cmp, scaled); + + check_synd = synd_cmp(s_priv, s_priv_cmp); + + // + + reformat_256to128(error128, error256); + PQCLEAN_MCELIECE6960119F_AVX_benes(error128, bits_int, 0); + + postprocess(e, error128); + + check_weight = 
weight_check(e, error128); + + return 1 - (check_synd & check_weight); +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/decrypt.h b/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/decrypt.h new file mode 100644 index 0000000000..7aed1ae2a1 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/decrypt.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_MCELIECE6960119F_AVX_DECRYPT_H +#define PQCLEAN_MCELIECE6960119F_AVX_DECRYPT_H +/* + This file is for Nieddereiter decryption +*/ + +int PQCLEAN_MCELIECE6960119F_AVX_decrypt(unsigned char * /*e*/, const unsigned char * /*sk*/, const unsigned char * /*c*/); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/encrypt.c b/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/encrypt.c new file mode 100644 index 0000000000..e830305a4e --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/encrypt.c @@ -0,0 +1,104 @@ +/* + This file is for Niederreiter encryption +*/ + +#include "encrypt.h" + +#include "int32_sort.h" +#include "params.h" +#include "randombytes.h" +#include "util.h" + +#include +#include +#include +#include + +#include "gf.h" + +/* input: public key pk, error vector e */ +/* output: syndrome s */ +extern void PQCLEAN_MCELIECE6960119F_AVX_syndrome_asm(unsigned char *s, const unsigned char *pk, unsigned char *e); + +/* output: e, an error vector of weight t */ +static void gen_e(unsigned char *e) { + int i, j, eq, count; + + uint16_t ind[ SYS_T * 2 ]; + int32_t ind32[ SYS_T * 2 ]; + uint64_t e_int[ (SYS_N + 63) / 64 ]; + uint64_t one = 1; + uint64_t mask; + uint64_t val[ SYS_T ]; + + while (1) { + randombytes((unsigned char *) ind, sizeof(ind)); + + for (i = 0; i < SYS_T * 2; i++) { + ind[i] &= GFMASK; + } + + // moving and counting indices in the correct range + + count = 0; + for (i = 0; i < SYS_T * 2; i++) { + if (ind[i] < SYS_N) { + ind32[ count++ ] = ind[i]; + } + } + + if (count < SYS_T) { + continue; + } + + // check for repetition + + PQCLEAN_MCELIECE6960119F_AVX_int32_sort(ind32, SYS_T); + + eq = 0; + for (i = 1; i < SYS_T; i++) { + if (ind32[i - 1] == ind32[i]) { + eq = 1; + } + } + + if (eq == 0) { + break; + } + } + + for (j = 0; j < SYS_T; j++) { + val[j] = one << (ind32[j] & 63); + } + + for (i = 0; i < (SYS_N + 63) / 64; i++) { + e_int[i] = 0; + + for (j = 0; j < SYS_T; j++) { + mask = i ^ (ind32[j] >> 6); + mask -= 1; + mask >>= 63; + mask = -mask; + + e_int[i] |= val[j] & mask; + } + } + + for (i = 0; i < (SYS_N + 63) / 64 - 1; i++) { + PQCLEAN_MCELIECE6960119F_AVX_store8(e, e_int[i]); + e += 8; + } + + for (j = 0; j < (SYS_N % 64); j += 8) { + e[ j / 8 ] = (e_int[i] >> j) & 0xFF; + } +} + +/* input: public key pk */ +/* output: error vector e, syndrome s */ +void PQCLEAN_MCELIECE6960119F_AVX_encrypt(unsigned char *s, unsigned char *e, const unsigned char *pk) { + gen_e(e); + + PQCLEAN_MCELIECE6960119F_AVX_syndrome_asm(s, pk, e); +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/encrypt.h b/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/encrypt.h new file mode 100644 index 0000000000..bacb1b370e --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/encrypt.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_MCELIECE6960119F_AVX_ENCRYPT_H +#define PQCLEAN_MCELIECE6960119F_AVX_ENCRYPT_H +/* + This file is for Niederreiter encryption +*/ + + +void PQCLEAN_MCELIECE6960119F_AVX_encrypt(unsigned char * /*s*/, unsigned char * /*e*/, const unsigned char * /*pk*/); + +#endif + diff --git 
a/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/fft.c b/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/fft.c new file mode 100644 index 0000000000..72d02d4bc5 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/fft.c @@ -0,0 +1,262 @@ +/* + This file is for the Gao-Mateer FFT + sse http://www.math.clemson.edu/~sgao/papers/GM10.pdf +*/ + +#include "fft.h" + +#include "transpose.h" +#include "vec256.h" + +#include + +/* input: in, polynomial in bitsliced form */ +/* output: in, result of applying the radix conversions on in */ +static void radix_conversions(vec128 *in) { + int i, j, k; + vec128 t; + uint64_t v0, v1; + + const vec128 mask[5][2] = { + { + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0x8888888888888888, 0x8888888888888888), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0x4444444444444444, 0x4444444444444444) + }, + { + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0xC0C0C0C0C0C0C0C0, 0xC0C0C0C0C0C0C0C0), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0x3030303030303030, 0x3030303030303030) + }, + { + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0xF000F000F000F000, 0xF000F000F000F000), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0x0F000F000F000F00, 0x0F000F000F000F00) + }, + { + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0xFF000000FF000000, 0xFF000000FF000000), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0x00FF000000FF0000, 0x00FF000000FF0000) + }, + { + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0xFFFF000000000000, 0xFFFF000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0x0000FFFF00000000, 0x0000FFFF00000000) + } + }; + + const vec128 s[5][GFBITS] = { +#include "scalars_2x.inc" + }; + + // + + for (j = 0; j <= 5; j++) { + for (i = 0; i < GFBITS; i++) { + v1 = PQCLEAN_MCELIECE6960119F_AVX_vec128_extract(in[i], 1); + v1 ^= v1 >> 32; + v0 = PQCLEAN_MCELIECE6960119F_AVX_vec128_extract(in[i], 0); + v0 ^= v1 << 32; + in[i] = PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(v0, v1); + } + + for (i = 0; i < GFBITS; i++) { + for (k = 4; k >= j; k--) { + t = PQCLEAN_MCELIECE6960119F_AVX_vec128_and(in[i], mask[k][0]); + t = PQCLEAN_MCELIECE6960119F_AVX_vec128_srl_2x(t, 1 << k); + in[i] = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(in[i], t); + + t = PQCLEAN_MCELIECE6960119F_AVX_vec128_and(in[i], mask[k][1]); + t = PQCLEAN_MCELIECE6960119F_AVX_vec128_srl_2x(t, 1 << k); + in[i] = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(in[i], t); + } + } + + if (j < 5) { + PQCLEAN_MCELIECE6960119F_AVX_vec128_mul(in, in, s[j]); // scaling + } + } +} + +/* input: in, result of applying the radix conversions to the input polynomial */ +/* output: out, evaluation results (by applying the FFT butterflies) */ +static void butterflies(vec256 out[][ GFBITS ], vec128 *in) { + int i, j, k, s, b; + + vec128 tmp[ GFBITS ]; + vec256 tmp0[ GFBITS ]; + vec256 tmp1[ GFBITS ]; + vec128 t[ GFBITS ]; + + union { + vec128 v[8][ GFBITS + 1 ]; + vec256 V[8][ (GFBITS + 1) / 2 ]; + } pre; + + union { + vec128 v[64][ 2 ]; + vec256 V[64]; + } buf; + + uint64_t v0, v1; + + const vec256 consts[ 33 ][ GFBITS ] = { +#include "consts.inc" + }; + + uint64_t consts_ptr = 2; + + const unsigned char reversal[64] = { + 0, 32, 16, 48, 8, 40, 24, 56, + 4, 36, 20, 52, 12, 44, 28, 60, + 2, 34, 18, 50, 10, 42, 26, 58, + 6, 38, 22, 54, 14, 46, 30, 62, + 1, 33, 17, 49, 9, 41, 25, 57, + 5, 37, 21, 53, 13, 45, 29, 61, + 3, 35, 19, 51, 11, 43, 27, 59, + 7, 39, 23, 55, 15, 47, 31, 63 + }; + + const uint16_t beta[8] = {2522, 7827, 7801, 8035, 6897, 8167, 3476, 0}; + + // boradcast + + for (j = 0; j < GFBITS; j++) { + t[j] = 
PQCLEAN_MCELIECE6960119F_AVX_vec128_unpack_high(in[j], in[j]); + } + + for (i = 0; i < 8; i += 2) { + for (j = 0; j < GFBITS; j++) { + v0 = (beta[i + 0] >> j) & 1; + v0 = -v0; + v1 = (beta[i + 1] >> j) & 1; + v1 = -v1; + + tmp[j] = PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(v0, v1); + } + + PQCLEAN_MCELIECE6960119F_AVX_vec128_mul(tmp, t, tmp); + + for (j = 0; j < GFBITS; j++) { + pre.v[i + 0][j] = PQCLEAN_MCELIECE6960119F_AVX_vec128_unpack_low(tmp[j], tmp[j]); + pre.v[i + 1][j] = PQCLEAN_MCELIECE6960119F_AVX_vec128_unpack_high(tmp[j], tmp[j]); + } + } + + for (i = 0; i < GFBITS; i += 2) { + if (i != GFBITS - 1) { + buf.v[0][1] = PQCLEAN_MCELIECE6960119F_AVX_vec128_unpack_low(in[i + 1], in[i + 1] ^ pre.v[6][i + 1]); + } + buf.v[0][0] = PQCLEAN_MCELIECE6960119F_AVX_vec128_unpack_low(in[i + 0], in[i + 0] ^ pre.v[6][i + 0]); + + buf.V[1] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[0], pre.V[0][i / 2]); + buf.V[16] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[0], pre.V[4][i / 2]); + buf.V[3] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[1], pre.V[1][i / 2]); + buf.V[48] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[16], pre.V[5][i / 2]); + buf.V[49] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[48], pre.V[0][i / 2]); + buf.V[2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[0], pre.V[1][i / 2]); + buf.V[51] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[49], pre.V[1][i / 2]); + buf.V[6] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[2], pre.V[2][i / 2]); + buf.V[50] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[51], pre.V[0][i / 2]); + buf.V[7] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[6], pre.V[0][i / 2]); + buf.V[54] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[50], pre.V[2][i / 2]); + buf.V[5] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[7], pre.V[1][i / 2]); + buf.V[55] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[54], pre.V[0][i / 2]); + buf.V[53] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[55], pre.V[1][i / 2]); + buf.V[4] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[0], pre.V[2][i / 2]); + buf.V[52] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[53], pre.V[0][i / 2]); + buf.V[12] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[4], pre.V[3][i / 2]); + buf.V[60] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[52], pre.V[3][i / 2]); + buf.V[13] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[12], pre.V[0][i / 2]); + buf.V[61] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[60], pre.V[0][i / 2]); + buf.V[15] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[13], pre.V[1][i / 2]); + buf.V[63] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[61], pre.V[1][i / 2]); + buf.V[14] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[15], pre.V[0][i / 2]); + buf.V[62] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[63], pre.V[0][i / 2]); + buf.V[10] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[14], pre.V[2][i / 2]); + buf.V[58] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[62], pre.V[2][i / 2]); + buf.V[11] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[10], pre.V[0][i / 2]); + buf.V[59] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[58], pre.V[0][i / 2]); + buf.V[9] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[11], pre.V[1][i / 2]); + buf.V[57] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[59], pre.V[1][i / 2]); + buf.V[56] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[57], pre.V[0][i / 2]); + buf.V[8] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[0], pre.V[3][i / 2]); + buf.V[40] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[56], pre.V[4][i / 2]); 
+ buf.V[24] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[8], pre.V[4][i / 2]); + buf.V[41] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[40], pre.V[0][i / 2]); + buf.V[25] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[24], pre.V[0][i / 2]); + buf.V[43] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[41], pre.V[1][i / 2]); + buf.V[27] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[25], pre.V[1][i / 2]); + buf.V[42] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[43], pre.V[0][i / 2]); + buf.V[26] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[27], pre.V[0][i / 2]); + buf.V[46] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[42], pre.V[2][i / 2]); + buf.V[30] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[26], pre.V[2][i / 2]); + buf.V[47] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[46], pre.V[0][i / 2]); + buf.V[31] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[30], pre.V[0][i / 2]); + buf.V[45] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[47], pre.V[1][i / 2]); + buf.V[29] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[31], pre.V[1][i / 2]); + buf.V[44] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[45], pre.V[0][i / 2]); + buf.V[28] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[29], pre.V[0][i / 2]); + buf.V[36] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[44], pre.V[3][i / 2]); + buf.V[20] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[28], pre.V[3][i / 2]); + buf.V[37] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[36], pre.V[0][i / 2]); + buf.V[21] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[20], pre.V[0][i / 2]); + buf.V[39] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[37], pre.V[1][i / 2]); + buf.V[23] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[21], pre.V[1][i / 2]); + buf.V[38] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[39], pre.V[0][i / 2]); + buf.V[22] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[23], pre.V[0][i / 2]); + buf.V[34] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[38], pre.V[2][i / 2]); + buf.V[18] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[22], pre.V[2][i / 2]); + buf.V[35] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[34], pre.V[0][i / 2]); + buf.V[19] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[18], pre.V[0][i / 2]); + buf.V[33] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[35], pre.V[1][i / 2]); + buf.V[17] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[19], pre.V[1][i / 2]); + buf.V[32] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[33], pre.V[0][i / 2]); + + // transpose + + PQCLEAN_MCELIECE6960119F_AVX_transpose_64x256_sp(buf.V); + + for (j = 0; j < 32; j++) { + if (i != GFBITS - 1) { + out[j][i + 1] = PQCLEAN_MCELIECE6960119F_AVX_vec256_unpack_high(buf.V[ reversal[2 * j + 0] ], buf.V[ reversal[2 * j + 1] ]); + } + out[j][i + 0] = PQCLEAN_MCELIECE6960119F_AVX_vec256_unpack_low (buf.V[ reversal[2 * j + 0] ], buf.V[ reversal[2 * j + 1] ]); + } + } + + // butterflies + + for (k = 0; k < 32; k += 2) { + for (b = 0; b < GFBITS; b++) { + tmp0[b] = PQCLEAN_MCELIECE6960119F_AVX_vec256_unpack_low (out[k][b], out[k + 1][b]); + } + for (b = 0; b < GFBITS; b++) { + tmp1[b] = PQCLEAN_MCELIECE6960119F_AVX_vec256_unpack_high (out[k][b], out[k + 1][b]); + } + + PQCLEAN_MCELIECE6960119F_AVX_vec256_maa_asm(tmp0, tmp1, consts[1]); + + for (b = 0; b < GFBITS; b++) { + out[k][b] = PQCLEAN_MCELIECE6960119F_AVX_vec256_unpack_low (tmp0[b], tmp1[b]); + } + for (b = 0; b < GFBITS; b++) { + out[k + 1][b] = PQCLEAN_MCELIECE6960119F_AVX_vec256_unpack_high (tmp0[b], tmp1[b]); + } + } + + for (i = 0; i <= 4; i++) { + s = 1 << i; 
+ + for (j = 0; j < 32; j += 2 * s) { + for (k = j; k < j + s; k++) { + PQCLEAN_MCELIECE6960119F_AVX_vec256_maa_asm(out[k], out[k + s], consts[ consts_ptr + (k - j) ]); + } + } + + consts_ptr += (1 << i); + } +} + +/* input: in, polynomial in bitsliced form */ +/* output: out, bitsliced results of evaluating in all the field elements */ +void PQCLEAN_MCELIECE6960119F_AVX_fft(vec256 out[][GFBITS], vec128 *in) { + radix_conversions(in); + butterflies(out, in); +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/fft.h b/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/fft.h new file mode 100644 index 0000000000..21ef0e9a3d --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/fft.h @@ -0,0 +1,17 @@ +#ifndef PQCLEAN_MCELIECE6960119F_AVX_FFT_H +#define PQCLEAN_MCELIECE6960119F_AVX_FFT_H +/* + This file is for the Gao-Mateer FFT + sse http://www.math.clemson.edu/~sgao/papers/GM10.pdf +*/ + + +#include "params.h" +#include "vec128.h" +#include "vec256.h" +#include + +void PQCLEAN_MCELIECE6960119F_AVX_fft(vec256 /*out*/[][GFBITS], vec128 * /*in*/); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/fft_tr.c b/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/fft_tr.c new file mode 100644 index 0000000000..44d483850c --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/fft_tr.c @@ -0,0 +1,400 @@ +/* + This file is for transpose of the Gao-Mateer FFT + Functions with names ending with _tr are (roughly) the transpose of the corresponding functions in fft.c +*/ + +#include "fft_tr.h" + +#include "transpose.h" + +#include + +static void radix_conversions_tr(vec256 *in) { + int i, j, k; + vec256 t; + uint64_t v[4]; + + const vec256 mask[6][2] = { + { + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0x2222222222222222, 0x2222222222222222, 0x2222222222222222, 0x2222222222222222), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0x4444444444444444, 0x4444444444444444, 0x4444444444444444, 0x4444444444444444) + }, + { + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0x0C0C0C0C0C0C0C0C, 0x0C0C0C0C0C0C0C0C, 0x0C0C0C0C0C0C0C0C, 0x0C0C0C0C0C0C0C0C), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0x3030303030303030, 0x3030303030303030, 0x3030303030303030, 0x3030303030303030) + }, + { + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0x00F000F000F000F0, 0x00F000F000F000F0, 0x00F000F000F000F0, 0x00F000F000F000F0), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0x0F000F000F000F00, 0x0F000F000F000F00, 0x0F000F000F000F00, 0x0F000F000F000F00) + }, + { + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0x0000FF000000FF00, 0x0000FF000000FF00, 0x0000FF000000FF00, 0x0000FF000000FF00), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0x00FF000000FF0000, 0x00FF000000FF0000, 0x00FF000000FF0000, 0x00FF000000FF0000) + }, + { + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0x00000000FFFF0000, 0x00000000FFFF0000, 0x00000000FFFF0000, 0x00000000FFFF0000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0x0000FFFF00000000, 0x0000FFFF00000000, 0x0000FFFF00000000, 0x0000FFFF00000000) + }, + { + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0x00000000FFFFFFFF, 0x00000000FFFFFFFF, 0x00000000FFFFFFFF, 0x00000000FFFFFFFF) + } + }; + + const vec256 s[6][GFBITS] = { +#include "scalars_4x.inc" + }; + + // + + for (j = 6; j >= 0; j--) { + if (j < 6) { + PQCLEAN_MCELIECE6960119F_AVX_vec256_mul(in, in, s[j]); // scaling + } + + for (k = j; k <= 4; k++) { + for (i = 0; i < 
GFBITS; i++) { + t = PQCLEAN_MCELIECE6960119F_AVX_vec256_and(in[i], mask[k][0]); + t = PQCLEAN_MCELIECE6960119F_AVX_vec256_sll_4x(t, 1 << k); + in[i] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(in[i], t); + + t = PQCLEAN_MCELIECE6960119F_AVX_vec256_and(in[i], mask[k][1]); + t = PQCLEAN_MCELIECE6960119F_AVX_vec256_sll_4x(t, 1 << k); + in[i] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(in[i], t); + } + } + + if (j <= 5) { + for (i = 0; i < GFBITS; i++) { + v[0] = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(in[i], 0); + v[1] = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(in[i], 1); + v[2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(in[i], 2); + v[3] = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(in[i], 3); + + v[1] ^= v[0] >> 32; + v[1] ^= v[1] << 32; + v[3] ^= v[2] >> 32; + v[3] ^= v[3] << 32; + + in[i] = PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(v[0], v[1], v[2], v[3]); + } + } + + for (i = 0; i < GFBITS; i++) { + v[0] = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(in[i], 0); + v[1] = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(in[i], 1); + v[2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(in[i], 2); + v[3] = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(in[i], 3); + + v[2] ^= v[1]; + v[3] ^= v[2]; + + in[i] = PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(v[0], v[1], v[2], v[3]); + } + + } +} + +static void butterflies_tr(vec256 *out, vec256 in[][ GFBITS ]) { + int i, j, k, s, b; + + vec256 t0[ GFBITS ]; + vec256 t1[ GFBITS ]; + vec256 t; + + vec128 out128[ GFBITS ][ 2 ]; + vec128 tmp[ GFBITS ]; + + union { + vec128 v[6][ GFBITS + 1 ]; + vec256 V[6][ (GFBITS + 1) / 2 ]; + } pre; + + union { + vec128 v[64][ 2 ]; + vec256 V[64]; + } buf; + + const vec256 consts[ 33 ][ GFBITS ] = { +#include "consts.inc" + }; + + uint64_t v[4]; + uint64_t consts_ptr = 33; + + const unsigned char reversal[] = { + 0, 32, 16, 48, 8, 40, 24, 56, + 4, 36, 20, 52, 12, 44, 28, 60, + 2, 34, 18, 50, 10, 42, 26, 58, + 6, 38, 22, 54, 14, 46, 30, 62, + 1, 33, 17, 49, 9, 41, 25, 57, + 5, 37, 21, 53, 13, 45, 29, 61, + 3, 35, 19, 51, 11, 43, 27, 59, + 7, 39, 23, 55, 15, 47, 31, 63 + }; + + const uint16_t beta[6] = {5246, 5306, 6039, 6685, 4905, 6755}; + + // butterflies + + for (i = 4; i >= 0; i--) { + s = 1 << i; + consts_ptr -= s; + + for (j = 0; j < 32; j += 2 * s) { + for (k = j; k < j + s; k++) { + PQCLEAN_MCELIECE6960119F_AVX_vec256_ama_asm(in[k], in[k + s], consts[ consts_ptr + (k - j) ]); + } + } + + } + + for (k = 0; k < 32; k += 2) { + for (b = 0; b < GFBITS; b++) { + t0[b] = PQCLEAN_MCELIECE6960119F_AVX_vec256_unpack_low(in[k][b], in[k + 1][b]); + } + for (b = 0; b < GFBITS; b++) { + t1[b] = PQCLEAN_MCELIECE6960119F_AVX_vec256_unpack_high(in[k][b], in[k + 1][b]); + } + + PQCLEAN_MCELIECE6960119F_AVX_vec256_ama_asm(t0, t1, consts[1]); + + for (b = 0; b < GFBITS; b++) { + in[k][b] = PQCLEAN_MCELIECE6960119F_AVX_vec256_unpack_low(t0[b], t1[b]); + } + for (b = 0; b < GFBITS; b++) { + in[k + 1][b] = PQCLEAN_MCELIECE6960119F_AVX_vec256_unpack_high(t0[b], t1[b]); + } + + for (b = 0; b < GFBITS; b++) { + t0[b] = PQCLEAN_MCELIECE6960119F_AVX_vec256_unpack_low_2x(in[k][b], in[k + 1][b]); + } + for (b = 0; b < GFBITS; b++) { + t1[b] = PQCLEAN_MCELIECE6960119F_AVX_vec256_unpack_high_2x(in[k][b], in[k + 1][b]); + } + + PQCLEAN_MCELIECE6960119F_AVX_vec256_ama_asm(t0, t1, consts[0]); + + for (b = 0; b < GFBITS; b++) { + in[k + 0][b] = PQCLEAN_MCELIECE6960119F_AVX_vec256_unpack_low_2x(t0[b], t1[b]); + } + for (b = 0; b < GFBITS; b++) { + in[k + 1][b] = PQCLEAN_MCELIECE6960119F_AVX_vec256_unpack_high_2x(t0[b], t1[b]); + } + } + + + // 
boradcast + + for (i = 0; i < GFBITS; i += 2) { + // transpose + + for (k = 0; k < 32; k++) { + if (i != GFBITS - 1) { + buf.v[ reversal[2 * k + 0] ][1] = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract2x(in[ k ][i + 1], 0); + buf.v[ reversal[2 * k + 1] ][1] = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract2x(in[ k ][i + 1], 1); + } + + buf.v[ reversal[2 * k + 0] ][0] = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract2x(in[ k ][i + 0], 0); + buf.v[ reversal[2 * k + 1] ][0] = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract2x(in[ k ][i + 0], 1); + } + + PQCLEAN_MCELIECE6960119F_AVX_transpose_64x256_sp(buf.V); + + // + + pre.V[0][i / 2] = buf.V[32]; + buf.V[33] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[33], buf.V[32]); + pre.V[1][i / 2] = buf.V[33]; + buf.V[35] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[35], buf.V[33]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[35]); + buf.V[34] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[34], buf.V[35]); + pre.V[2][i / 2] = buf.V[34]; + buf.V[38] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[38], buf.V[34]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[38]); + buf.V[39] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[39], buf.V[38]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[39]); + buf.V[37] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[37], buf.V[39]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[37]); + buf.V[36] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[36], buf.V[37]); + pre.V[3][i / 2] = buf.V[36]; + buf.V[44] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[44], buf.V[36]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[44]); + buf.V[45] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[45], buf.V[44]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[45]); + buf.V[47] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[47], buf.V[45]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[47]); + buf.V[46] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[46], buf.V[47]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[2][i / 2], buf.V[46]); + buf.V[42] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[42], buf.V[46]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[42]); + buf.V[43] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[43], buf.V[42]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[43]); + buf.V[41] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[41], buf.V[43]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[41]); + buf.V[40] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[40], buf.V[41]); + pre.V[4][i / 2] = buf.V[40]; + buf.V[56] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[56], buf.V[40]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[56]); + buf.V[57] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[57], buf.V[56]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[57]); + buf.V[59] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[59], buf.V[57]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[59]); + buf.V[58] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[58], buf.V[59]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[2][i / 2], 
buf.V[58]); + buf.V[62] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[62], buf.V[58]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[62]); + buf.V[63] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[63], buf.V[62]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[63]); + buf.V[61] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[61], buf.V[63]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[61]); + buf.V[60] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[60], buf.V[61]); + pre.V[3][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[3][i / 2], buf.V[60]); + buf.V[52] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[52], buf.V[60]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[52]); + buf.V[53] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[53], buf.V[52]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[53]); + buf.V[55] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[55], buf.V[53]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[55]); + buf.V[54] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[54], buf.V[55]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[2][i / 2], buf.V[54]); + buf.V[50] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[50], buf.V[54]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[50]); + buf.V[51] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[51], buf.V[50]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[51]); + buf.V[49] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[49], buf.V[51]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[49]); + buf.V[48] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[48], buf.V[49]); + pre.V[5][i / 2] = buf.V[48]; + buf.V[16] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[16], buf.V[48]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[16]); + buf.V[17] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[17], buf.V[16]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[17]); + buf.V[19] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[19], buf.V[17]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[19]); + buf.V[18] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[18], buf.V[19]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[2][i / 2], buf.V[18]); + buf.V[22] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[22], buf.V[18]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[22]); + buf.V[23] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[23], buf.V[22]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[23]); + buf.V[21] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[21], buf.V[23]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[21]); + buf.V[20] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[20], buf.V[21]); + pre.V[3][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[3][i / 2], buf.V[20]); + buf.V[28] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[28], buf.V[20]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[28]); + buf.V[29] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[29], buf.V[28]); + pre.V[1][i / 2] = 
PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[29]); + buf.V[31] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[31], buf.V[29]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[31]); + buf.V[30] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[30], buf.V[31]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[2][i / 2], buf.V[30]); + buf.V[26] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[26], buf.V[30]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[26]); + buf.V[27] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[27], buf.V[26]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[27]); + buf.V[25] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[25], buf.V[27]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[25]); + buf.V[24] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[24], buf.V[25]); + pre.V[4][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[4][i / 2], buf.V[24]); + buf.V[8] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[8], buf.V[24]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[8]); + buf.V[9] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[9], buf.V[8]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[9]); + buf.V[11] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[11], buf.V[9]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[11]); + buf.V[10] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[10], buf.V[11]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[2][i / 2], buf.V[10]); + buf.V[14] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[14], buf.V[10]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[14]); + buf.V[15] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[15], buf.V[14]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[15]); + buf.V[13] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[13], buf.V[15]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[13]); + buf.V[12] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[12], buf.V[13]); + pre.V[3][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[3][i / 2], buf.V[12]); + buf.V[4] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[4], buf.V[12]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[4]); + buf.V[5] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[5], buf.V[4]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[5]); + buf.V[7] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[7], buf.V[5]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[7]); + buf.V[6] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[6], buf.V[7]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[2][i / 2], buf.V[6]); + buf.V[2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[2], buf.V[6]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[2]); + buf.V[3] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[3], buf.V[2]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[3]); + buf.V[1] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[1], buf.V[3]); + + pre.V[0][i / 2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[1]); + t = 
PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(buf.V[0], buf.V[1]); + + if (i != GFBITS - 1) { + out128[i + 1][0] = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract2x(t, 1); + } + out128[i + 0][0] = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract2x(t, 0); + + } + + // + + for (j = 0; j < GFBITS; j++) { + tmp[j] = PQCLEAN_MCELIECE6960119F_AVX_vec128_setbits((beta[0] >> j) & 1); + } + + PQCLEAN_MCELIECE6960119F_AVX_vec128_mul(tmp, pre.v[0], tmp); + + for (b = 0; b < GFBITS; b++) { + out128[b][1] = tmp[b]; + } + + for (i = 1; i < 6; i++) { + for (j = 0; j < GFBITS; j++) { + tmp[j] = PQCLEAN_MCELIECE6960119F_AVX_vec128_setbits((beta[i] >> j) & 1); + } + + PQCLEAN_MCELIECE6960119F_AVX_vec128_mul(tmp, pre.v[i], tmp); + + for (b = 0; b < GFBITS; b++) { + out128[b][1] = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(out128[b][1], tmp[b]); + } + } + + for (b = 0; b < GFBITS; b++) { + v[0] = PQCLEAN_MCELIECE6960119F_AVX_vec128_extract(out128[b][0], 0); + v[1] = PQCLEAN_MCELIECE6960119F_AVX_vec128_extract(out128[b][0], 1); + v[2] = PQCLEAN_MCELIECE6960119F_AVX_vec128_extract(out128[b][1], 0); + v[3] = PQCLEAN_MCELIECE6960119F_AVX_vec128_extract(out128[b][1], 1); + + out[b] = PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(v[0], v[1], v[2], v[3]); + } +} + +/* justifying the length of the output */ +static void postprocess(vec256 *out) { + int i; + uint64_t v[4]; + + for (i = 0; i < 13; i++) { + v[0] = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(out[i], 0); + v[1] = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(out[i], 1); + v[2] = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(out[i], 2); + v[3] = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(out[i], 3); + + v[3] <<= (128 - SYS_T) * 2; + v[3] >>= (128 - SYS_T) * 2; + + out[i] = PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(v[0], v[1], v[2], v[3]); + } +} + + +void PQCLEAN_MCELIECE6960119F_AVX_fft_tr(vec256 *out, vec256 in[][ GFBITS ]) { + butterflies_tr(out, in); + radix_conversions_tr(out); + + postprocess(out); +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/fft_tr.h b/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/fft_tr.h new file mode 100644 index 0000000000..464413bc0a --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/fft_tr.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE6960119F_AVX_FFT_TR_H +#define PQCLEAN_MCELIECE6960119F_AVX_FFT_TR_H +/* + This file is for transpose of the Gao-Mateer FFT +*/ + + +#include "params.h" +#include "vec256.h" + +void PQCLEAN_MCELIECE6960119F_AVX_fft_tr(vec256 * /*out*/, vec256 /*in*/[][ GFBITS ]); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/gf.c b/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/gf.c new file mode 100644 index 0000000000..a9949c3086 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/gf.c @@ -0,0 +1,203 @@ +/* + this file is for functions for field arithmetic +*/ + +#include "gf.h" + +#include "params.h" + +/* check if a == 0 */ +gf PQCLEAN_MCELIECE6960119F_AVX_gf_iszero(gf a) { + uint32_t t = a; + + t -= 1; + t >>= 19; + + return (gf) t; +} + +/* field multiplication */ +gf PQCLEAN_MCELIECE6960119F_AVX_gf_mul(gf in0, gf in1) { + int i; + + uint64_t tmp; + uint64_t t0; + uint64_t t1; + uint64_t t; + + t0 = in0; + t1 = in1; + + tmp = t0 * (t1 & 1); + + for (i = 1; i < GFBITS; i++) { + tmp ^= (t0 * (t1 & ((uint64_t)1 << i))); + } + + // + + t = tmp & 0x1FF0000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + t = tmp & 0x000E000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + return tmp & GFMASK; +} + 
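+/* Note on the reduction in gf_mul above: the two folds
+       t = tmp & 0x1FF0000;  tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13);
+       t = tmp & 0x000E000;  tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13);
+   first clear product bits 16..24 and then bits 13..15 re-introduced by the first
+   fold; this is reduction modulo x^13 + x^4 + x^3 + x + 1, the GF(2^13) field
+   polynomial used by this parameter set, and GFMASK keeps the low 13 bits. */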
+/* square twice */ +static inline gf gf_sq2(gf in) { + int i; + + const uint64_t B[] = {0x1111111111111111, + 0x0303030303030303, + 0x000F000F000F000F, + 0x000000FF000000FF + }; + + const uint64_t M[] = {0x0001FF0000000000, + 0x000000FF80000000, + 0x000000007FC00000, + 0x00000000003FE000 + }; + + uint64_t x = in; + uint64_t t; + + x = (x | (x << 24)) & B[3]; + x = (x | (x << 12)) & B[2]; + x = (x | (x << 6)) & B[1]; + x = (x | (x << 3)) & B[0]; + + for (i = 0; i < 4; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* square and multiply */ +static inline gf gf_sqmul(gf in, gf m) { + int i; + + uint64_t x; + uint64_t t0; + uint64_t t1; + uint64_t t; + + const uint64_t M[] = {0x0000001FF0000000, + 0x000000000FF80000, + 0x000000000007E000 + }; + + t0 = in; + t1 = m; + + x = (t1 << 6) * (t0 & (1 << 6)); + + t0 ^= (t0 << 7); + + x ^= (t1 * (t0 & (0x04001))); + x ^= (t1 * (t0 & (0x08002))) << 1; + x ^= (t1 * (t0 & (0x10004))) << 2; + x ^= (t1 * (t0 & (0x20008))) << 3; + x ^= (t1 * (t0 & (0x40010))) << 4; + x ^= (t1 * (t0 & (0x80020))) << 5; + + for (i = 0; i < 3; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* square twice and multiply */ +static inline gf gf_sq2mul(gf in, gf m) { + int i; + + uint64_t x; + uint64_t t0; + uint64_t t1; + uint64_t t; + + const uint64_t M[] = {0x1FF0000000000000, + 0x000FF80000000000, + 0x000007FC00000000, + 0x00000003FE000000, + 0x0000000001FE0000, + 0x000000000001E000 + }; + + t0 = in; + t1 = m; + + x = (t1 << 18) * (t0 & (1 << 6)); + + t0 ^= (t0 << 21); + + x ^= (t1 * (t0 & (0x010000001))); + x ^= (t1 * (t0 & (0x020000002))) << 3; + x ^= (t1 * (t0 & (0x040000004))) << 6; + x ^= (t1 * (t0 & (0x080000008))) << 9; + x ^= (t1 * (t0 & (0x100000010))) << 12; + x ^= (t1 * (t0 & (0x200000020))) << 15; + + for (i = 0; i < 6; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* return num/den */ +gf PQCLEAN_MCELIECE6960119F_AVX_gf_frac(gf den, gf num) { + gf tmp_11; + gf tmp_1111; + gf out; + + tmp_11 = gf_sqmul(den, den); // ^11 + tmp_1111 = gf_sq2mul(tmp_11, tmp_11); // ^1111 + out = gf_sq2(tmp_1111); + out = gf_sq2mul(out, tmp_1111); // ^11111111 + out = gf_sq2(out); + out = gf_sq2mul(out, tmp_1111); // ^111111111111 + + return gf_sqmul(out, num); // ^1111111111110 = ^-1 +} + +/* return 1/den */ +gf PQCLEAN_MCELIECE6960119F_AVX_gf_inv(gf in) { + return PQCLEAN_MCELIECE6960119F_AVX_gf_frac(in, ((gf) 1)); +} + +/* multiplication in GF((2^m)^t) */ +void PQCLEAN_MCELIECE6960119F_AVX_GF_mul(gf *out, const gf *in0, const gf *in1) { + int i, j; + + gf prod[237]; + + for (i = 0; i < 237; i++) { + prod[i] = 0; + } + + for (i = 0; i < 119; i++) { + for (j = 0; j < 119; j++) { + prod[i + j] ^= PQCLEAN_MCELIECE6960119F_AVX_gf_mul(in0[i], in1[j]); + } + } + + // + + for (i = 236; i >= 119; i--) { + prod[i - 117] ^= PQCLEAN_MCELIECE6960119F_AVX_gf_mul(prod[i], (gf) 6400); + prod[i - 119] ^= PQCLEAN_MCELIECE6960119F_AVX_gf_mul(prod[i], (gf) 3134); + } + + for (i = 0; i < 119; i++) { + out[i] = prod[i]; + } +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/gf.h b/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/gf.h new file mode 100644 index 0000000000..fc4686cfac --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/gf.h @@ -0,0 +1,22 @@ +#ifndef PQCLEAN_MCELIECE6960119F_AVX_GF_H +#define PQCLEAN_MCELIECE6960119F_AVX_GF_H +/* + This file is for functions 
for field arithmetic +*/ + + +#include + +typedef uint16_t gf; + +gf PQCLEAN_MCELIECE6960119F_AVX_gf_iszero(gf a); +gf PQCLEAN_MCELIECE6960119F_AVX_gf_add(gf in0, gf in1); +gf PQCLEAN_MCELIECE6960119F_AVX_gf_mul(gf in0, gf in1); +uint64_t PQCLEAN_MCELIECE6960119F_AVX_gf_mul2(gf a, gf b0, gf b1); +gf PQCLEAN_MCELIECE6960119F_AVX_gf_frac(gf den, gf num); +gf PQCLEAN_MCELIECE6960119F_AVX_gf_inv(gf in); + +void PQCLEAN_MCELIECE6960119F_AVX_GF_mul(gf *out, const gf *in0, const gf *in1); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/int32_sort.c b/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/int32_sort.c new file mode 100644 index 0000000000..73bf75a2d6 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/int32_sort.c @@ -0,0 +1,1211 @@ +#include "int32_sort.h" +#define int32 int32_t + +#include + +typedef __m256i int32x8; +#define int32x8_load(z) _mm256_loadu_si256((__m256i_u *) (z)) +#define int32x8_store(z,i) _mm256_storeu_si256((__m256i_u *) (z),(i)) +#define int32x8_min _mm256_min_epi32 +#define int32x8_max _mm256_max_epi32 + +#define int32x8_MINMAX(a,b) \ + do { \ + int32x8 c = int32x8_min(a, b); \ + (b) = int32x8_max(a,b); \ + (a) = c; \ + } while(0) + +static inline void int32_MINMAX(int32 *a, int32 *b) { + int32 ab = *b ^ *a; + int32 c = *b - *a; + c ^= ab & (c ^ *b); + c >>= 31; + c &= ab; + *a ^= c; + *b ^= c; +} + +static void minmax_vector(int32 *x, int32 *y, size_t n) { + if (n < 8) { + while (n > 0) { + int32_MINMAX(x, y); + ++x; + ++y; + --n; + } + return; + } + if (n & 7) { + int32x8 x0 = int32x8_load(x + n - 8); + int32x8 y0 = int32x8_load(y + n - 8); + int32x8_MINMAX(x0, y0); + int32x8_store(x + n - 8, x0); + int32x8_store(y + n - 8, y0); + n &= ~7; + } + do { + int32x8 x0 = int32x8_load(x); + int32x8 y0 = int32x8_load(y); + int32x8_MINMAX(x0, y0); + int32x8_store(x, x0); + int32x8_store(y, y0); + x += 8; + y += 8; + n -= 8; + } while (n); +} + +/* stages 8,4,2,1 of size-16 bitonic merging */ +static void merge16_finish(int32 *x, int32x8 x0, int32x8 x1, int flagdown) { + int32x8 b0, b1, c0, c1, mask; + + int32x8_MINMAX(x0, x1); + + b0 = _mm256_permute2x128_si256(x0, x1, 0x20); /* A0123B0123 */ + b1 = _mm256_permute2x128_si256(x0, x1, 0x31); /* A4567B4567 */ + + int32x8_MINMAX(b0, b1); + + c0 = _mm256_unpacklo_epi64(b0, b1); /* A0145B0145 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* A2367B2367 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A0213B0213 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A4657B4657 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* A0246B0246 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* A1357B1357 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A0123B0123 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A4567B4567 */ + + x0 = _mm256_permute2x128_si256(b0, b1, 0x20); /* A01234567 */ + x1 = _mm256_permute2x128_si256(b0, b1, 0x31); /* A01234567 */ + + if (flagdown) { + mask = _mm256_set1_epi32(-1); + x0 ^= mask; + x1 ^= mask; + } + + int32x8_store(&x[0], x0); + int32x8_store(&x[8], x1); +} + +/* stages 64,32 of bitonic merging; n is multiple of 128 */ +static void int32_twostages_32(int32 *x, size_t n) { + size_t i; + + while (n > 0) { + for (i = 0; i < 32; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + 32]); + int32x8 x2 = int32x8_load(&x[i + 64]); + int32x8 x3 = int32x8_load(&x[i + 96]); + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + + int32x8_store(&x[i], x0); 
+ int32x8_store(&x[i + 32], x1); + int32x8_store(&x[i + 64], x2); + int32x8_store(&x[i + 96], x3); + } + x += 128; + n -= 128; + } +} + +/* stages 4q,2q,q of bitonic merging */ +static size_t int32_threestages(int32 *x, size_t n, size_t q) { + size_t k, i; + + for (k = 0; k + 8 * q <= n; k += 8 * q) { + for (i = k; i < k + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + } + + return k; +} + +/* n is a power of 2; n >= 8; if n == 8 then flagdown */ +// NOLINTNEXTLINE(google-readability-function-size) +static void int32_sort_2power(int32 *x, size_t n, int flagdown) { + size_t p, q, i, j, k; + int32x8 mask; + + if (n == 8) { + int32 x0 = x[0]; + int32 x1 = x[1]; + int32 x2 = x[2]; + int32 x3 = x[3]; + int32 x4 = x[4]; + int32 x5 = x[5]; + int32 x6 = x[6]; + int32 x7 = x[7]; + + /* odd-even sort instead of bitonic sort */ + + int32_MINMAX(&x1, &x0); + int32_MINMAX(&x3, &x2); + int32_MINMAX(&x2, &x0); + int32_MINMAX(&x3, &x1); + int32_MINMAX(&x2, &x1); + + int32_MINMAX(&x5, &x4); + int32_MINMAX(&x7, &x6); + int32_MINMAX(&x6, &x4); + int32_MINMAX(&x7, &x5); + int32_MINMAX(&x6, &x5); + + int32_MINMAX(&x4, &x0); + int32_MINMAX(&x6, &x2); + int32_MINMAX(&x4, &x2); + + int32_MINMAX(&x5, &x1); + int32_MINMAX(&x7, &x3); + int32_MINMAX(&x5, &x3); + + int32_MINMAX(&x2, &x1); + int32_MINMAX(&x4, &x3); + int32_MINMAX(&x6, &x5); + + x[0] = x0; + x[1] = x1; + x[2] = x2; + x[3] = x3; + x[4] = x4; + x[5] = x5; + x[6] = x6; + x[7] = x7; + return; + } + + if (n == 16) { + int32x8 x0, x1, b0, b1, c0, c1; + + x0 = int32x8_load(&x[0]); + x1 = int32x8_load(&x[8]); + + mask = _mm256_set_epi32(0, 0, -1, -1, 0, 0, -1, -1); + + x0 ^= mask; /* A01234567 */ + x1 ^= mask; /* B01234567 */ + + b0 = _mm256_unpacklo_epi32(x0, x1); /* AB0AB1AB4AB5 */ + b1 = _mm256_unpackhi_epi32(x0, x1); /* AB2AB3AB6AB7 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* AB0AB2AB4AB6 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* AB1AB3AB5AB7 */ + + int32x8_MINMAX(c0, c1); + + mask = _mm256_set_epi32(0, 0, -1, -1, -1, -1, 0, 0); + c0 ^= mask; + c1 ^= mask; + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A01B01A45B45 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A23B23A67B67 */ + + int32x8_MINMAX(b0, b1); + + x0 = _mm256_unpacklo_epi64(b0, b1); /* A01234567 */ + x1 = _mm256_unpackhi_epi64(b0, b1); /* B01234567 */ + + b0 = _mm256_unpacklo_epi32(x0, x1); /* AB0AB1AB4AB5 */ + b1 = _mm256_unpackhi_epi32(x0, x1); /* AB2AB3AB6AB7 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* AB0AB2AB4AB6 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* AB1AB3AB5AB7 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A01B01A45B45 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* 
A23B23A67B67 */ + + b0 ^= mask; + b1 ^= mask; + + c0 = _mm256_permute2x128_si256(b0, b1, 0x20); /* A01B01A23B23 */ + c1 = _mm256_permute2x128_si256(b0, b1, 0x31); /* A45B45A67B67 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_permute2x128_si256(c0, c1, 0x20); /* A01B01A45B45 */ + b1 = _mm256_permute2x128_si256(c0, c1, 0x31); /* A23B23A67B67 */ + + int32x8_MINMAX(b0, b1); + + x0 = _mm256_unpacklo_epi64(b0, b1); /* A01234567 */ + x1 = _mm256_unpackhi_epi64(b0, b1); /* B01234567 */ + + b0 = _mm256_unpacklo_epi32(x0, x1); /* AB0AB1AB4AB5 */ + b1 = _mm256_unpackhi_epi32(x0, x1); /* AB2AB3AB6AB7 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* AB0AB2AB4AB6 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* AB1AB3AB5AB7 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A01B01A45B45 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A23B23A67B67 */ + + x0 = _mm256_unpacklo_epi64(b0, b1); /* A01234567 */ + x1 = _mm256_unpackhi_epi64(b0, b1); /* B01234567 */ + + mask = _mm256_set1_epi32(-1); + if (flagdown) { + x1 ^= mask; + } else { + x0 ^= mask; + } + + merge16_finish(x, x0, x1, flagdown); + return; + } + + if (n == 32) { + int32x8 x0, x1, x2, x3; + + int32_sort_2power(x, 16, 1); + int32_sort_2power(x + 16, 16, 0); + + x0 = int32x8_load(&x[0]); + x1 = int32x8_load(&x[8]); + x2 = int32x8_load(&x[16]); + x3 = int32x8_load(&x[24]); + + if (flagdown) { + mask = _mm256_set1_epi32(-1); + x0 ^= mask; + x1 ^= mask; + x2 ^= mask; + x3 ^= mask; + } + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + + merge16_finish(x, x0, x1, flagdown); + merge16_finish(x + 16, x2, x3, flagdown); + return; + } + + p = n >> 3; + for (i = 0; i < p; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x2 = int32x8_load(&x[i + 2 * p]); + int32x8 x4 = int32x8_load(&x[i + 4 * p]); + int32x8 x6 = int32x8_load(&x[i + 6 * p]); + + /* odd-even stage instead of bitonic stage */ + + int32x8_MINMAX(x4, x0); + int32x8_MINMAX(x6, x2); + int32x8_MINMAX(x2, x0); + int32x8_MINMAX(x6, x4); + int32x8_MINMAX(x2, x4); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + 2 * p], x2); + int32x8_store(&x[i + 4 * p], x4); + int32x8_store(&x[i + 6 * p], x6); + + int32x8 x1 = int32x8_load(&x[i + p]); + int32x8 x3 = int32x8_load(&x[i + 3 * p]); + int32x8 x5 = int32x8_load(&x[i + 5 * p]); + int32x8 x7 = int32x8_load(&x[i + 7 * p]); + + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x5, x3); + + int32x8_store(&x[i + p], x1); + int32x8_store(&x[i + 3 * p], x3); + int32x8_store(&x[i + 5 * p], x5); + int32x8_store(&x[i + 7 * p], x7); + } + + if (n >= 128) { + int flip, flipflip; + + mask = _mm256_set1_epi32(-1); + + for (j = 0; j < n; j += 32) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 16]); + x0 ^= mask; + x1 ^= mask; + int32x8_store(&x[j], x0); + int32x8_store(&x[j + 16], x1); + } + + p = 8; + for (;;) { /* for p in [8, 16, ..., n/16] */ + q = p >> 1; + while (q >= 128) { + int32_threestages(x, n, q >> 2); + q >>= 3; + } + if (q == 64) { + int32_twostages_32(x, n); + q = 16; + } + if (q == 32) { + q = 8; + for (k = 0; k < n; k += 8 * q) { + for (i = k; i < k + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x4); + 
int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + } + q = 4; + } + if (q == 16) { + q = 8; + for (k = 0; k < n; k += 4 * q) { + for (i = k; i < k + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + } + } + q = 4; + } + if (q == 8) { + for (k = 0; k < n; k += q + q) { + int32x8 x0 = int32x8_load(&x[k]); + int32x8 x1 = int32x8_load(&x[k + q]); + + int32x8_MINMAX(x0, x1); + + int32x8_store(&x[k], x0); + int32x8_store(&x[k + q], x1); + } + } + + q = n >> 3; + flip = 0; + if (p << 1 == q) { + flip = 1; + } + flipflip = 1 - flip; + for (j = 0; j < q; j += p + p) { + for (k = j; k < j + p + p; k += p) { + for (i = k; i < k + p; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + + if (flip) { + x0 ^= mask; + x1 ^= mask; + x2 ^= mask; + x3 ^= mask; + x4 ^= mask; + x5 ^= mask; + x6 ^= mask; + x7 ^= mask; + } + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + flip ^= 1; + } + flip ^= flipflip; + } + + if (p << 4 == n) { + break; + } + p <<= 1; + } + } + + for (p = 4; p >= 1; p >>= 1) { + int32 *z = x; + int32 *target = x + n; + if (p == 4) { + mask = _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1); + while (z != target) { + int32x8 x0 = int32x8_load(&z[0]); + int32x8 x1 = int32x8_load(&z[8]); + x0 ^= mask; + x1 ^= mask; + int32x8_store(&z[0], x0); + int32x8_store(&z[8], x1); + z += 16; + } + } else if (p == 2) { + mask = _mm256_set_epi32(0, 0, -1, -1, -1, -1, 0, 0); + while (z != target) { + int32x8 x0 = int32x8_load(&z[0]); + int32x8 x1 = int32x8_load(&z[8]); + x0 ^= mask; + x1 ^= mask; + int32x8 b0 = _mm256_permute2x128_si256(x0, x1, 0x20); + int32x8 b1 = _mm256_permute2x128_si256(x0, x1, 0x31); + int32x8_MINMAX(b0, b1); + int32x8 c0 = _mm256_permute2x128_si256(b0, b1, 0x20); + int32x8 c1 = _mm256_permute2x128_si256(b0, b1, 0x31); + int32x8_store(&z[0], c0); + int32x8_store(&z[8], c1); + z += 16; + } 
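+        /* descriptive note: the p == 1 branch below performs the final,
+           finest-grained compare-exchange step, combining the 128-bit lane
+           permutes with 64-bit unpacks around int32x8_MINMAX. */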
+ } else { /* p == 1 */ + mask = _mm256_set_epi32(0, -1, -1, 0, 0, -1, -1, 0); + while (z != target) { + int32x8 x0 = int32x8_load(&z[0]); + int32x8 x1 = int32x8_load(&z[8]); + x0 ^= mask; + x1 ^= mask; + int32x8 b0 = _mm256_permute2x128_si256(x0, x1, 0x20); /* A0123B0123 */ + int32x8 b1 = _mm256_permute2x128_si256(x0, x1, 0x31); /* A4567B4567 */ + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); /* A0145B0145 */ + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); /* A2367B2367 */ + int32x8_MINMAX(c0, c1); + int32x8 d0 = _mm256_unpacklo_epi64(c0, c1); /* A0123B0123 */ + int32x8 d1 = _mm256_unpackhi_epi64(c0, c1); /* A4567B4567 */ + int32x8_MINMAX(d0, d1); + int32x8 e0 = _mm256_permute2x128_si256(d0, d1, 0x20); + int32x8 e1 = _mm256_permute2x128_si256(d0, d1, 0x31); + int32x8_store(&z[0], e0); + int32x8_store(&z[8], e1); + z += 16; + } + } + + q = n >> 4; + while (q >= 128 || q == 32) { + int32_threestages(x, n, q >> 2); + q >>= 3; + } + while (q >= 16) { + q >>= 1; + for (j = 0; j < n; j += 4 * q) { + for (k = j; k < j + q; k += 8) { + int32x8 x0 = int32x8_load(&x[k]); + int32x8 x1 = int32x8_load(&x[k + q]); + int32x8 x2 = int32x8_load(&x[k + 2 * q]); + int32x8 x3 = int32x8_load(&x[k + 3 * q]); + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + + int32x8_store(&x[k], x0); + int32x8_store(&x[k + q], x1); + int32x8_store(&x[k + 2 * q], x2); + int32x8_store(&x[k + 3 * q], x3); + } + } + q >>= 1; + } + if (q == 8) { + for (j = 0; j < n; j += 2 * q) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + q]); + + int32x8_MINMAX(x0, x1); + + int32x8_store(&x[j], x0); + int32x8_store(&x[j + q], x1); + } + } + + q = n >> 3; + for (k = 0; k < q; k += 8) { + int32x8 x0 = int32x8_load(&x[k]); + int32x8 x1 = int32x8_load(&x[k + q]); + int32x8 x2 = int32x8_load(&x[k + 2 * q]); + int32x8 x3 = int32x8_load(&x[k + 3 * q]); + int32x8 x4 = int32x8_load(&x[k + 4 * q]); + int32x8 x5 = int32x8_load(&x[k + 5 * q]); + int32x8 x6 = int32x8_load(&x[k + 6 * q]); + int32x8 x7 = int32x8_load(&x[k + 7 * q]); + + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + + int32x8_store(&x[k], x0); + int32x8_store(&x[k + q], x1); + int32x8_store(&x[k + 2 * q], x2); + int32x8_store(&x[k + 3 * q], x3); + int32x8_store(&x[k + 4 * q], x4); + int32x8_store(&x[k + 5 * q], x5); + int32x8_store(&x[k + 6 * q], x6); + int32x8_store(&x[k + 7 * q], x7); + } + } + + /* everything is still masked with _mm256_set_epi32(0,-1,0,-1,0,-1,0,-1); */ + mask = _mm256_set1_epi32(-1); + + for (i = 0; i < n; i += 64) { + int32x8 a0 = int32x8_load(&x[i]); + int32x8 a1 = int32x8_load(&x[i + 8]); + int32x8 a2 = int32x8_load(&x[i + 16]); + int32x8 a3 = int32x8_load(&x[i + 24]); + int32x8 a4 = int32x8_load(&x[i + 32]); + int32x8 a5 = int32x8_load(&x[i + 40]); + int32x8 a6 = int32x8_load(&x[i + 48]); + int32x8 a7 = int32x8_load(&x[i + 56]); + + int32x8 b0 = _mm256_unpacklo_epi32(a0, a1); /* AB0AB1AB4AB5 */ + int32x8 b1 = _mm256_unpackhi_epi32(a0, a1); /* AB2AB3AB6AB7 */ + int32x8 b2 = _mm256_unpacklo_epi32(a2, a3); /* CD0CD1CD4CD5 */ + int32x8 b3 = _mm256_unpackhi_epi32(a2, a3); /* CD2CD3CD6CD7 */ + int32x8 b4 = _mm256_unpacklo_epi32(a4, a5); /* EF0EF1EF4EF5 */ + int32x8 b5 = _mm256_unpackhi_epi32(a4, a5); /* EF2EF3EF6EF7 */ + int32x8 b6 = 
_mm256_unpacklo_epi32(a6, a7); /* GH0GH1GH4GH5 */ + int32x8 b7 = _mm256_unpackhi_epi32(a6, a7); /* GH2GH3GH6GH7 */ + + int32x8 c0 = _mm256_unpacklo_epi64(b0, b2); /* ABCD0ABCD4 */ + int32x8 c1 = _mm256_unpacklo_epi64(b1, b3); /* ABCD2ABCD6 */ + int32x8 c2 = _mm256_unpackhi_epi64(b0, b2); /* ABCD1ABCD5 */ + int32x8 c3 = _mm256_unpackhi_epi64(b1, b3); /* ABCD3ABCD7 */ + int32x8 c4 = _mm256_unpacklo_epi64(b4, b6); /* EFGH0EFGH4 */ + int32x8 c5 = _mm256_unpacklo_epi64(b5, b7); /* EFGH2EFGH6 */ + int32x8 c6 = _mm256_unpackhi_epi64(b4, b6); /* EFGH1EFGH5 */ + int32x8 c7 = _mm256_unpackhi_epi64(b5, b7); /* EFGH3EFGH7 */ + + if (flagdown) { + c2 ^= mask; + c3 ^= mask; + c6 ^= mask; + c7 ^= mask; + } else { + c0 ^= mask; + c1 ^= mask; + c4 ^= mask; + c5 ^= mask; + } + + int32x8 d0 = _mm256_permute2x128_si256(c0, c4, 0x20); /* ABCDEFGH0 */ + int32x8 d1 = _mm256_permute2x128_si256(c2, c6, 0x20); /* ABCDEFGH1 */ + int32x8 d2 = _mm256_permute2x128_si256(c1, c5, 0x20); /* ABCDEFGH2 */ + int32x8 d3 = _mm256_permute2x128_si256(c3, c7, 0x20); /* ABCDEFGH5 */ + int32x8 d4 = _mm256_permute2x128_si256(c0, c4, 0x31); /* ABCDEFGH4 */ + int32x8 d5 = _mm256_permute2x128_si256(c2, c6, 0x31); /* ABCDEFGH3 */ + int32x8 d6 = _mm256_permute2x128_si256(c1, c5, 0x31); /* ABCDEFGH6 */ + int32x8 d7 = _mm256_permute2x128_si256(c3, c7, 0x31); /* ABCDEFGH7 */ + + int32x8_MINMAX(d0, d1); + int32x8_MINMAX(d2, d3); + int32x8_MINMAX(d4, d5); + int32x8_MINMAX(d6, d7); + int32x8_MINMAX(d0, d2); + int32x8_MINMAX(d1, d3); + int32x8_MINMAX(d4, d6); + int32x8_MINMAX(d5, d7); + int32x8_MINMAX(d0, d4); + int32x8_MINMAX(d1, d5); + int32x8_MINMAX(d2, d6); + int32x8_MINMAX(d3, d7); + + int32x8 e0 = _mm256_unpacklo_epi32(d0, d1); + int32x8 e1 = _mm256_unpackhi_epi32(d0, d1); + int32x8 e2 = _mm256_unpacklo_epi32(d2, d3); + int32x8 e3 = _mm256_unpackhi_epi32(d2, d3); + int32x8 e4 = _mm256_unpacklo_epi32(d4, d5); + int32x8 e5 = _mm256_unpackhi_epi32(d4, d5); + int32x8 e6 = _mm256_unpacklo_epi32(d6, d7); + int32x8 e7 = _mm256_unpackhi_epi32(d6, d7); + + int32x8 f0 = _mm256_unpacklo_epi64(e0, e2); + int32x8 f1 = _mm256_unpacklo_epi64(e1, e3); + int32x8 f2 = _mm256_unpackhi_epi64(e0, e2); + int32x8 f3 = _mm256_unpackhi_epi64(e1, e3); + int32x8 f4 = _mm256_unpacklo_epi64(e4, e6); + int32x8 f5 = _mm256_unpacklo_epi64(e5, e7); + int32x8 f6 = _mm256_unpackhi_epi64(e4, e6); + int32x8 f7 = _mm256_unpackhi_epi64(e5, e7); + + int32x8 g0 = _mm256_permute2x128_si256(f0, f4, 0x20); + int32x8 g1 = _mm256_permute2x128_si256(f2, f6, 0x20); + int32x8 g2 = _mm256_permute2x128_si256(f1, f5, 0x20); + int32x8 g3 = _mm256_permute2x128_si256(f3, f7, 0x20); + int32x8 g4 = _mm256_permute2x128_si256(f0, f4, 0x31); + int32x8 g5 = _mm256_permute2x128_si256(f2, f6, 0x31); + int32x8 g6 = _mm256_permute2x128_si256(f1, f5, 0x31); + int32x8 g7 = _mm256_permute2x128_si256(f3, f7, 0x31); + + int32x8_store(&x[i], g0); + int32x8_store(&x[i + 8], g1); + int32x8_store(&x[i + 16], g2); + int32x8_store(&x[i + 24], g3); + int32x8_store(&x[i + 32], g4); + int32x8_store(&x[i + 40], g5); + int32x8_store(&x[i + 48], g6); + int32x8_store(&x[i + 56], g7); + } + + q = n >> 4; + while (q >= 128 || q == 32) { + q >>= 2; + for (j = 0; j < n; j += 8 * q) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = 
int32x8_load(&x[i + 7 * q]); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + } + q >>= 1; + } + while (q >= 16) { + q >>= 1; + for (j = 0; j < n; j += 4 * q) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + } + } + q >>= 1; + } + if (q == 8) { + for (j = 0; j < n; j += q + q) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + q]); + int32x8_MINMAX(x0, x1); + int32x8_store(&x[j], x0); + int32x8_store(&x[j + q], x1); + } + } + + q = n >> 3; + for (i = 0; i < q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + + int32x8 b0 = _mm256_unpacklo_epi32(x0, x4); /* AE0AE1AE4AE5 */ + int32x8 b1 = _mm256_unpackhi_epi32(x0, x4); /* AE2AE3AE6AE7 */ + int32x8 b2 = _mm256_unpacklo_epi32(x1, x5); /* BF0BF1BF4BF5 */ + int32x8 b3 = _mm256_unpackhi_epi32(x1, x5); /* BF2BF3BF6BF7 */ + int32x8 b4 = _mm256_unpacklo_epi32(x2, x6); /* CG0CG1CG4CG5 */ + int32x8 b5 = _mm256_unpackhi_epi32(x2, x6); /* CG2CG3CG6CG7 */ + int32x8 b6 = _mm256_unpacklo_epi32(x3, x7); /* DH0DH1DH4DH5 */ + int32x8 b7 = _mm256_unpackhi_epi32(x3, x7); /* DH2DH3DH6DH7 */ + + int32x8 c0 = _mm256_unpacklo_epi64(b0, b4); /* AECG0AECG4 */ + int32x8 c1 = _mm256_unpacklo_epi64(b1, b5); /* AECG2AECG6 */ + int32x8 c2 = _mm256_unpackhi_epi64(b0, b4); /* AECG1AECG5 */ + int32x8 c3 = _mm256_unpackhi_epi64(b1, b5); /* AECG3AECG7 */ + int32x8 c4 = _mm256_unpacklo_epi64(b2, b6); /* BFDH0BFDH4 */ + int32x8 c5 = _mm256_unpacklo_epi64(b3, b7); /* BFDH2BFDH6 */ + int32x8 c6 = _mm256_unpackhi_epi64(b2, b6); /* BFDH1BFDH5 */ + int32x8 c7 = _mm256_unpackhi_epi64(b3, b7); /* BFDH3BFDH7 */ + + int32x8 d0 = _mm256_permute2x128_si256(c0, c4, 0x20); /* AECGBFDH0 */ + int32x8 d1 = _mm256_permute2x128_si256(c1, c5, 0x20); /* AECGBFDH2 */ + int32x8 d2 = _mm256_permute2x128_si256(c2, c6, 0x20); /* AECGBFDH1 */ + int32x8 d3 = _mm256_permute2x128_si256(c3, c7, 0x20); /* AECGBFDH3 */ + int32x8 d4 = _mm256_permute2x128_si256(c0, c4, 0x31); /* AECGBFDH4 */ + int32x8 d5 = _mm256_permute2x128_si256(c1, c5, 0x31); /* AECGBFDH6 */ + 
int32x8 d6 = _mm256_permute2x128_si256(c2, c6, 0x31); /* AECGBFDH5 */ + int32x8 d7 = _mm256_permute2x128_si256(c3, c7, 0x31); /* AECGBFDH7 */ + + if (flagdown) { + d0 ^= mask; + d1 ^= mask; + d2 ^= mask; + d3 ^= mask; + d4 ^= mask; + d5 ^= mask; + d6 ^= mask; + d7 ^= mask; + } + + int32x8_store(&x[i], d0); + int32x8_store(&x[i + q], d4); + int32x8_store(&x[i + 2 * q], d1); + int32x8_store(&x[i + 3 * q], d5); + int32x8_store(&x[i + 4 * q], d2); + int32x8_store(&x[i + 5 * q], d6); + int32x8_store(&x[i + 6 * q], d3); + int32x8_store(&x[i + 7 * q], d7); + } +} + +void PQCLEAN_MCELIECE6960119F_AVX_int32_sort(int32 *x, size_t n) { + size_t q, i, j; + + if (n <= 8) { + if (n == 8) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + int32_MINMAX(&x[4], &x[5]); + int32_MINMAX(&x[5], &x[6]); + int32_MINMAX(&x[6], &x[7]); + } + if (n >= 7) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + int32_MINMAX(&x[4], &x[5]); + int32_MINMAX(&x[5], &x[6]); + } + if (n >= 6) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + int32_MINMAX(&x[4], &x[5]); + } + if (n >= 5) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + } + if (n >= 4) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + } + if (n >= 3) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + } + if (n >= 2) { + int32_MINMAX(&x[0], &x[1]); + } + return; + } + + if (!(n & (n - 1))) { + int32_sort_2power(x, n, 0); + return; + } + + q = 8; + while (q < n - q) { + q += q; + } + /* n > q >= 8 */ + + if (q <= 128) { /* n <= 256 */ + int32x8 y[32]; + for (i = q >> 3; i < q >> 2; ++i) { + y[i] = _mm256_set1_epi32(0x7fffffff); + } + for (i = 0; i < n; ++i) { + ((int32 *)y)[i] = x[i]; + } + int32_sort_2power((int32 *) y, 2 * q, 0); + for (i = 0; i < n; ++i) { + x[i] = ((int32 *) y)[i]; + } + return; + } + + int32_sort_2power(x, q, 1); + PQCLEAN_MCELIECE6960119F_AVX_int32_sort(x + q, n - q); + + while (q >= 64) { + q >>= 2; + j = int32_threestages(x, n, q); + minmax_vector(x + j, x + j + 4 * q, n - 4 * q - j); + if (j + 4 * q <= n) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + } + j += 4 * q; + } + minmax_vector(x + j, x + j + 2 * q, n - 2 * q - j); + if (j + 2 * q <= n) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8_MINMAX(x0, x1); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + } + j += 2 * q; + } + minmax_vector(x + j, x + j + q, n - q - j); + q >>= 1; + } + if (q == 32) { + j = 0; + for (; j + 64 <= n; j += 64) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 8]); + int32x8 x2 = int32x8_load(&x[j + 16]); + int32x8 x3 = int32x8_load(&x[j + 24]); + int32x8 x4 = int32x8_load(&x[j + 32]); + int32x8 x5 = int32x8_load(&x[j + 40]); + int32x8 x6 = int32x8_load(&x[j + 48]); + int32x8 x7 = int32x8_load(&x[j + 56]); + 
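+            /* descriptive note: q == 32 merge -- the eight vectors covering this
+               64-element block go through the comparator network below
+               (int32x8_MINMAX calls), then 128-bit lane permutes and 32/64-bit
+               unpacks restore the element order expected in memory before the
+               block is stored back. */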
int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8 a0 = _mm256_permute2x128_si256(x0, x1, 0x20); + int32x8 a1 = _mm256_permute2x128_si256(x0, x1, 0x31); + int32x8 a2 = _mm256_permute2x128_si256(x2, x3, 0x20); + int32x8 a3 = _mm256_permute2x128_si256(x2, x3, 0x31); + int32x8 a4 = _mm256_permute2x128_si256(x4, x5, 0x20); + int32x8 a5 = _mm256_permute2x128_si256(x4, x5, 0x31); + int32x8 a6 = _mm256_permute2x128_si256(x6, x7, 0x20); + int32x8 a7 = _mm256_permute2x128_si256(x6, x7, 0x31); + int32x8_MINMAX(a0, a1); + int32x8_MINMAX(a2, a3); + int32x8_MINMAX(a4, a5); + int32x8_MINMAX(a6, a7); + int32x8 b0 = _mm256_permute2x128_si256(a0, a1, 0x20); + int32x8 b1 = _mm256_permute2x128_si256(a0, a1, 0x31); + int32x8 b2 = _mm256_permute2x128_si256(a2, a3, 0x20); + int32x8 b3 = _mm256_permute2x128_si256(a2, a3, 0x31); + int32x8 b4 = _mm256_permute2x128_si256(a4, a5, 0x20); + int32x8 b5 = _mm256_permute2x128_si256(a4, a5, 0x31); + int32x8 b6 = _mm256_permute2x128_si256(a6, a7, 0x20); + int32x8 b7 = _mm256_permute2x128_si256(a6, a7, 0x31); + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); + int32x8 c2 = _mm256_unpacklo_epi64(b2, b3); + int32x8 c3 = _mm256_unpackhi_epi64(b2, b3); + int32x8 c4 = _mm256_unpacklo_epi64(b4, b5); + int32x8 c5 = _mm256_unpackhi_epi64(b4, b5); + int32x8 c6 = _mm256_unpacklo_epi64(b6, b7); + int32x8 c7 = _mm256_unpackhi_epi64(b6, b7); + int32x8_MINMAX(c0, c1); + int32x8_MINMAX(c2, c3); + int32x8_MINMAX(c4, c5); + int32x8_MINMAX(c6, c7); + int32x8 d0 = _mm256_unpacklo_epi32(c0, c1); + int32x8 d1 = _mm256_unpackhi_epi32(c0, c1); + int32x8 d2 = _mm256_unpacklo_epi32(c2, c3); + int32x8 d3 = _mm256_unpackhi_epi32(c2, c3); + int32x8 d4 = _mm256_unpacklo_epi32(c4, c5); + int32x8 d5 = _mm256_unpackhi_epi32(c4, c5); + int32x8 d6 = _mm256_unpacklo_epi32(c6, c7); + int32x8 d7 = _mm256_unpackhi_epi32(c6, c7); + int32x8 e0 = _mm256_unpacklo_epi64(d0, d1); + int32x8 e1 = _mm256_unpackhi_epi64(d0, d1); + int32x8 e2 = _mm256_unpacklo_epi64(d2, d3); + int32x8 e3 = _mm256_unpackhi_epi64(d2, d3); + int32x8 e4 = _mm256_unpacklo_epi64(d4, d5); + int32x8 e5 = _mm256_unpackhi_epi64(d4, d5); + int32x8 e6 = _mm256_unpacklo_epi64(d6, d7); + int32x8 e7 = _mm256_unpackhi_epi64(d6, d7); + int32x8_MINMAX(e0, e1); + int32x8_MINMAX(e2, e3); + int32x8_MINMAX(e4, e5); + int32x8_MINMAX(e6, e7); + int32x8 f0 = _mm256_unpacklo_epi32(e0, e1); + int32x8 f1 = _mm256_unpackhi_epi32(e0, e1); + int32x8 f2 = _mm256_unpacklo_epi32(e2, e3); + int32x8 f3 = _mm256_unpackhi_epi32(e2, e3); + int32x8 f4 = _mm256_unpacklo_epi32(e4, e5); + int32x8 f5 = _mm256_unpackhi_epi32(e4, e5); + int32x8 f6 = _mm256_unpacklo_epi32(e6, e7); + int32x8 f7 = _mm256_unpackhi_epi32(e6, e7); + int32x8_store(&x[j], f0); + int32x8_store(&x[j + 8], f1); + int32x8_store(&x[j + 16], f2); + int32x8_store(&x[j + 24], f3); + int32x8_store(&x[j + 32], f4); + int32x8_store(&x[j + 40], f5); + int32x8_store(&x[j + 48], f6); + int32x8_store(&x[j + 56], f7); + } + minmax_vector(x + j, x + j + 32, n - 32 - j); + goto continue16; + } + if (q == 16) { + j = 0; +continue16: + for (; j + 32 <= n; j += 32) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 8]); + int32x8 x2 = int32x8_load(&x[j + 16]); + int32x8 x3 = int32x8_load(&x[j + 24]); + 
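+        /* descriptive note: q == 16 merge -- same pattern as above but on four
+           vectors (32 elements) per block; any tail shorter than a full block is
+           handled by the minmax_vector() call after this loop. */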
int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8 a0 = _mm256_permute2x128_si256(x0, x1, 0x20); + int32x8 a1 = _mm256_permute2x128_si256(x0, x1, 0x31); + int32x8 a2 = _mm256_permute2x128_si256(x2, x3, 0x20); + int32x8 a3 = _mm256_permute2x128_si256(x2, x3, 0x31); + int32x8_MINMAX(a0, a1); + int32x8_MINMAX(a2, a3); + int32x8 b0 = _mm256_permute2x128_si256(a0, a1, 0x20); + int32x8 b1 = _mm256_permute2x128_si256(a0, a1, 0x31); + int32x8 b2 = _mm256_permute2x128_si256(a2, a3, 0x20); + int32x8 b3 = _mm256_permute2x128_si256(a2, a3, 0x31); + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); + int32x8 c2 = _mm256_unpacklo_epi64(b2, b3); + int32x8 c3 = _mm256_unpackhi_epi64(b2, b3); + int32x8_MINMAX(c0, c1); + int32x8_MINMAX(c2, c3); + int32x8 d0 = _mm256_unpacklo_epi32(c0, c1); + int32x8 d1 = _mm256_unpackhi_epi32(c0, c1); + int32x8 d2 = _mm256_unpacklo_epi32(c2, c3); + int32x8 d3 = _mm256_unpackhi_epi32(c2, c3); + int32x8 e0 = _mm256_unpacklo_epi64(d0, d1); + int32x8 e1 = _mm256_unpackhi_epi64(d0, d1); + int32x8 e2 = _mm256_unpacklo_epi64(d2, d3); + int32x8 e3 = _mm256_unpackhi_epi64(d2, d3); + int32x8_MINMAX(e0, e1); + int32x8_MINMAX(e2, e3); + int32x8 f0 = _mm256_unpacklo_epi32(e0, e1); + int32x8 f1 = _mm256_unpackhi_epi32(e0, e1); + int32x8 f2 = _mm256_unpacklo_epi32(e2, e3); + int32x8 f3 = _mm256_unpackhi_epi32(e2, e3); + int32x8_store(&x[j], f0); + int32x8_store(&x[j + 8], f1); + int32x8_store(&x[j + 16], f2); + int32x8_store(&x[j + 24], f3); + } + minmax_vector(x + j, x + j + 16, n - 16 - j); + goto continue8; + } + /* q == 8 */ + j = 0; +continue8: + for (; j + 16 <= n; j += 16) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 8]); + int32x8_MINMAX(x0, x1); + int32x8_store(&x[j], x0); + int32x8_store(&x[j + 8], x1); + int32x8 a0 = _mm256_permute2x128_si256(x0, x1, 0x20); /* x0123y0123 */ + int32x8 a1 = _mm256_permute2x128_si256(x0, x1, 0x31); /* x4567y4567 */ + int32x8_MINMAX(a0, a1); + int32x8 b0 = _mm256_permute2x128_si256(a0, a1, 0x20); /* x01234567 */ + int32x8 b1 = _mm256_permute2x128_si256(a0, a1, 0x31); /* y01234567 */ + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); /* x01y01x45y45 */ + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); /* x23y23x67y67 */ + int32x8_MINMAX(c0, c1); + int32x8 d0 = _mm256_unpacklo_epi32(c0, c1); /* x02x13x46x57 */ + int32x8 d1 = _mm256_unpackhi_epi32(c0, c1); /* y02y13y46y57 */ + int32x8 e0 = _mm256_unpacklo_epi64(d0, d1); /* x02y02x46y46 */ + int32x8 e1 = _mm256_unpackhi_epi64(d0, d1); /* x13y13x57y57 */ + int32x8_MINMAX(e0, e1); + int32x8 f0 = _mm256_unpacklo_epi32(e0, e1); /* x01234567 */ + int32x8 f1 = _mm256_unpackhi_epi32(e0, e1); /* y01234567 */ + int32x8_store(&x[j], f0); + int32x8_store(&x[j + 8], f1); + } + minmax_vector(x + j, x + j + 8, n - 8 - j); + if (j + 8 <= n) { + int32_MINMAX(&x[j], &x[j + 4]); + int32_MINMAX(&x[j + 1], &x[j + 5]); + int32_MINMAX(&x[j + 2], &x[j + 6]); + int32_MINMAX(&x[j + 3], &x[j + 7]); + int32_MINMAX(&x[j], &x[j + 2]); + int32_MINMAX(&x[j + 1], &x[j + 3]); + int32_MINMAX(&x[j], &x[j + 1]); + int32_MINMAX(&x[j + 2], &x[j + 3]); + int32_MINMAX(&x[j + 4], &x[j + 6]); + int32_MINMAX(&x[j + 5], &x[j + 7]); + int32_MINMAX(&x[j + 4], &x[j + 5]); + int32_MINMAX(&x[j + 6], &x[j + 7]); + j += 8; + } + minmax_vector(x + j, x + j + 4, n - 4 - j); + if (j + 4 <= n) { + int32_MINMAX(&x[j], &x[j + 2]); + int32_MINMAX(&x[j + 1], &x[j + 3]); + int32_MINMAX(&x[j], &x[j + 1]); + int32_MINMAX(&x[j + 2], &x[j + 3]); + 
j += 4; + } + if (j + 3 <= n) { + int32_MINMAX(&x[j], &x[j + 2]); + } + if (j + 2 <= n) { + int32_MINMAX(&x[j], &x[j + 1]); + } +} diff --git a/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/int32_sort.h b/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/int32_sort.h new file mode 100644 index 0000000000..8832689793 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/int32_sort.h @@ -0,0 +1,9 @@ +#ifndef PQCLEAN_MCELIECE6960119F_AVX_INT32_SORT_H +#define PQCLEAN_MCELIECE6960119F_AVX_INT32_SORT_H + +#include +#include + +void PQCLEAN_MCELIECE6960119F_AVX_int32_sort(int32_t *x, size_t n); + +#endif diff --git a/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/operations.c b/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/operations.c new file mode 100644 index 0000000000..141987d887 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/operations.c @@ -0,0 +1,136 @@ +#include "api.h" + +#include "aes256ctr.h" +#include "controlbits.h" +#include "crypto_hash.h" +#include "decrypt.h" +#include "encrypt.h" +#include "params.h" +#include "pk_gen.h" +#include "randombytes.h" +#include "sk_gen.h" +#include "util.h" + +#include +#include + +int PQCLEAN_MCELIECE6960119F_AVX_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const uint8_t *pk +) { + uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; + uint8_t *e = two_e + 1; + uint8_t one_ec[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ] = {1}; + + PQCLEAN_MCELIECE6960119F_AVX_encrypt(c, e, pk); + + crypto_hash_32b(c + SYND_BYTES, two_e, sizeof(two_e)); + + memcpy(one_ec + 1, e, SYS_N / 8); + memcpy(one_ec + 1 + SYS_N / 8, c, SYND_BYTES + 32); + + crypto_hash_32b(key, one_ec, sizeof(one_ec)); + + return 0; +} + +int PQCLEAN_MCELIECE6960119F_AVX_crypto_kem_dec( + uint8_t *key, + const uint8_t *c, + const uint8_t *sk +) { + int i; + + uint8_t ret_confirm = 0; + uint8_t ret_decrypt = 0; + + uint16_t m; + + uint8_t conf[32]; + uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; + uint8_t *e = two_e + 1; + uint8_t preimage[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ]; + uint8_t *x = preimage; + + // + + ret_decrypt = (uint8_t)PQCLEAN_MCELIECE6960119F_AVX_decrypt(e, sk + SYS_N / 8, c); + + crypto_hash_32b(conf, two_e, sizeof(two_e)); + + for (i = 0; i < 32; i++) { + ret_confirm |= conf[i] ^ c[SYND_BYTES + i]; + } + + m = ret_decrypt | ret_confirm; + m -= 1; + m >>= 8; + + *x++ = (~m & 0) | (m & 1); + for (i = 0; i < SYS_N / 8; i++) { + *x++ = (~m & sk[i]) | (m & e[i]); + } + for (i = 0; i < SYND_BYTES + 32; i++) { + *x++ = c[i]; + } + + crypto_hash_32b(key, preimage, sizeof(preimage)); + + return 0; +} + +int PQCLEAN_MCELIECE6960119F_AVX_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +) { + int i; + uint8_t seed[ 32 ]; + uint8_t r[ SYS_T * 2 + (1 << GFBITS)*sizeof(uint32_t) + SYS_N / 8 + 32 ]; + uint8_t nonce[ 16 ] = {0}; + uint8_t *rp; + + gf f[ SYS_T ]; // element in GF(2^mt) + gf irr[ SYS_T ]; // Goppa polynomial + uint32_t perm[ 1 << GFBITS ]; // random permutation + + randombytes(seed, sizeof(seed)); + + while (1) { + rp = r; + PQCLEAN_MCELIECE6960119F_AVX_aes256ctr(r, sizeof(r), nonce, seed); + memcpy(seed, &r[ sizeof(r) - 32 ], 32); + + for (i = 0; i < SYS_T; i++) { + f[i] = PQCLEAN_MCELIECE6960119F_AVX_load2(rp + i * 2); + } + rp += sizeof(f); + if (PQCLEAN_MCELIECE6960119F_AVX_genpoly_gen(irr, f)) { + continue; + } + + for (i = 0; i < (1 << GFBITS); i++) { + perm[i] = PQCLEAN_MCELIECE6960119F_AVX_load4(rp + i * 4); + } + rp += sizeof(perm); + if (PQCLEAN_MCELIECE6960119F_AVX_perm_check(perm)) { + continue; + } + + for (i = 0; 
i < SYS_T; i++) { + PQCLEAN_MCELIECE6960119F_AVX_store2(sk + SYS_N / 8 + i * 2, irr[i]); + } + if (PQCLEAN_MCELIECE6960119F_AVX_pk_gen(pk, perm, sk + SYS_N / 8)) { + continue; + } + + memcpy(sk, rp, SYS_N / 8); + PQCLEAN_MCELIECE6960119F_AVX_controlbits(sk + SYS_N / 8 + IRR_BYTES, perm); + + break; + } + + return 0; +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/params.h b/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/params.h new file mode 100644 index 0000000000..b97615e42b --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/params.h @@ -0,0 +1,21 @@ +#ifndef PQCLEAN_MCELIECE6960119F_AVX_PARAMS_H +#define PQCLEAN_MCELIECE6960119F_AVX_PARAMS_H + +#define GFBITS 13 +#define SYS_N 6960 +#define SYS_T 119 + +#define COND_BYTES ((1 << (GFBITS-4))*(2*GFBITS - 1)) +#define IRR_BYTES (SYS_T * 2) + +#define PK_NROWS (SYS_T*GFBITS) +#define PK_NCOLS (SYS_N - PK_NROWS) +#define PK_ROW_BYTES ((PK_NCOLS + 7)/8) + +#define SK_BYTES (SYS_N/8 + IRR_BYTES + COND_BYTES) +#define SYND_BYTES ((PK_NROWS + 7)/8) + +#define GFMASK ((1 << GFBITS) - 1) + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/pk_gen.c b/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/pk_gen.c new file mode 100644 index 0000000000..4f8e15296e --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/pk_gen.c @@ -0,0 +1,372 @@ +/* + This file is for public-key generation +*/ + +#include "pk_gen.h" + +#include "benes.h" +#include "controlbits.h" +#include "fft.h" +#include "params.h" +#include "util.h" + +#include + +#define min(a, b) (((a) < (b)) ? (a) : (b)) + +static void de_bitslicing(uint64_t *out, vec256 in[][GFBITS]) { + int i, j, r; + uint64_t u = 0; + + for (i = 0; i < (1 << GFBITS); i++) { + out[i] = 0 ; + } + + for (i = 0; i < 32; i++) { + for (j = GFBITS - 1; j >= 0; j--) { + u = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(in[i][j], 0); + for (r = 0; r < 64; r++) { + out[i * 256 + 0 * 64 + r] <<= 1; + out[i * 256 + 0 * 64 + r] |= (u >> r) & 1; + } + u = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(in[i][j], 1); + for (r = 0; r < 64; r++) { + out[i * 256 + 1 * 64 + r] <<= 1; + out[i * 256 + 1 * 64 + r] |= (u >> r) & 1; + } + u = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(in[i][j], 2); + for (r = 0; r < 64; r++) { + out[i * 256 + 2 * 64 + r] <<= 1; + out[i * 256 + 2 * 64 + r] |= (u >> r) & 1; + } + u = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(in[i][j], 3); + for (r = 0; r < 64; r++) { + out[i * 256 + 3 * 64 + r] <<= 1; + out[i * 256 + 3 * 64 + r] |= (u >> r) & 1; + } + } + } +} + +static void to_bitslicing_2x(vec256 out0[][GFBITS], vec256 out1[][GFBITS], const uint64_t *in) { + int i, j, k, r; + uint64_t u[4] = {0}; + + for (i = 0; i < 32; i++) { + for (j = GFBITS - 1; j >= 0; j--) { + for (k = 0; k < 4; k++) { + for (r = 63; r >= 0; r--) { + u[k] <<= 1; + u[k] |= (in[i * 256 + k * 64 + r] >> (j + GFBITS)) & 1; + } + } + + out1[i][j] = PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(u[0], u[1], u[2], u[3]); + } + + for (j = GFBITS - 1; j >= 0; j--) { + for (k = 0; k < 4; k++) { + for (r = 63; r >= 0; r--) { + u[k] <<= 1; + u[k] |= (in[i * 256 + k * 64 + r] >> j) & 1; + } + } + + out0[i][GFBITS - 1 - j] = PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(u[0], u[1], u[2], u[3]); + } + } +} + +static void transpose_64x64(uint64_t *out, const uint64_t *in) { + int i, j, s, d; + + uint64_t x, y; + uint64_t masks[6][2] = { + {0x5555555555555555, 0xAAAAAAAAAAAAAAAA}, + {0x3333333333333333, 0xCCCCCCCCCCCCCCCC}, + {0x0F0F0F0F0F0F0F0F, 
0xF0F0F0F0F0F0F0F0}, + {0x00FF00FF00FF00FF, 0xFF00FF00FF00FF00}, + {0x0000FFFF0000FFFF, 0xFFFF0000FFFF0000}, + {0x00000000FFFFFFFF, 0xFFFFFFFF00000000} + }; + + for (i = 0; i < 64; i++) { + out[i] = in[i]; + } + + for (d = 5; d >= 0; d--) { + s = 1 << d; + + for (i = 0; i < 64; i += s * 2) { + for (j = i; j < i + s; j++) { + x = (out[j] & masks[d][0]) | ((out[j + s] & masks[d][0]) << s); + y = ((out[j] & masks[d][1]) >> s) | (out[j + s] & masks[d][1]); + + out[j + 0] = x; + out[j + s] = y; + } + } + } +} + +/* return number of trailing zeros of the non-zero input in */ +static inline int ctz(uint64_t in) { + return (int)_tzcnt_u64(in); +} + +static inline uint64_t same_mask(uint16_t x, uint16_t y) { + uint64_t mask; + + mask = x ^ y; + mask -= 1; + mask >>= 63; + mask = -mask; + + return mask; +} + +static int mov_columns(uint64_t mat[][ ((SYS_N + 255) / 256) * 4 ], uint32_t *perm) { + int i, j, k, s, block_idx, row, tail; + uint64_t buf[64], ctz_list[32], t, d, mask; + + row = GFBITS * SYS_T - 32; + block_idx = row / 64; + tail = row % 64; + + // extract the 32x64 matrix + + for (i = 0; i < 32; i++) { + buf[i] = (mat[ row + i ][ block_idx + 0 ] >> tail) | + (mat[ row + i ][ block_idx + 1 ] << (64 - tail)); + } + + // compute the column indices of pivots by Gaussian elimination. + // the indices are stored in ctz_list + + for (i = 0; i < 32; i++) { + t = buf[i]; + for (j = i + 1; j < 32; j++) { + t |= buf[j]; + } + + if (t == 0) { + return -1; // return if buf is not full rank + } + + ctz_list[i] = s = ctz(t); + + for (j = i + 1; j < 32; j++) { + mask = (buf[i] >> s) & 1; + mask -= 1; + buf[i] ^= buf[j] & mask; + } + for (j = 0; j < i; j++) { + mask = (buf[j] >> s) & 1; + mask = -mask; + buf[j] ^= buf[i] & mask; + } + for (j = i + 1; j < 32; j++) { + mask = (buf[j] >> s) & 1; + mask = -mask; + buf[j] ^= buf[i] & mask; + } + } + + // updating permutation + + for (j = 0; j < 32; j++) { + for (k = j + 1; k < 64; k++) { + d = perm[ row + j ] ^ perm[ row + k ]; + d &= same_mask(k, ctz_list[j]); + perm[ row + j ] ^= d; + perm[ row + k ] ^= d; + } + } + + // moving columns of mat according to the column indices of pivots + + for (i = 0; i < GFBITS * SYS_T; i += 64) { + + for (j = 0; j < min(64, GFBITS * SYS_T - i); j++) { + buf[j] = (mat[ i + j ][ block_idx + 0 ] >> tail) | + (mat[ i + j ][ block_idx + 1 ] << (64 - tail)); + } + + transpose_64x64(buf, buf); + + for (j = 0; j < 32; j++) { + for (k = j + 1; k < 64; k++) { + d = buf[ j ] ^ buf[ k ]; + d &= same_mask(k, ctz_list[j]); + buf[ j ] ^= d; + buf[ k ] ^= d; + } + } + + transpose_64x64(buf, buf); + + for (j = 0; j < min(64, GFBITS * SYS_T - i); j++) { + mat[ i + j ][ block_idx + 0 ] = (mat[ i + j ][ block_idx + 0 ] << (64 - tail) >> (64 - tail)) | (buf[j] << tail); + mat[ i + j ][ block_idx + 1 ] = (mat[ i + j ][ block_idx + 1 ] >> tail << tail) | (buf[j] >> (64 - tail)); + } + } + + return 0; +} + +#define NBLOCKS1_H ((SYS_N + 63) / 64) +#define NBLOCKS2_H ((SYS_N + 255) / 256) +#define NBLOCKS1_I ((GFBITS * SYS_T + 63) / 64) +int PQCLEAN_MCELIECE6960119F_AVX_pk_gen(unsigned char *pk, uint32_t *perm, const unsigned char *sk) { + const int block_idx = NBLOCKS1_I - 1; + int tail = (GFBITS * SYS_T) % 64; + + int i, j, k; + int row, c; + + uint64_t mat[ GFBITS * SYS_T ][ NBLOCKS2_H * 4 ]; + + uint64_t mask; + + vec128 sk_int[ GFBITS ]; + + vec256 consts[ 32 ][ GFBITS ]; + vec256 eval[ 32 ][ GFBITS ]; + vec256 prod[ 32 ][ GFBITS ]; + vec256 tmp[ GFBITS ]; + + uint64_t list[1 << GFBITS]; + uint64_t one_row[ NBLOCKS2_H * 4 ]; + + // compute the 
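As an aside, same_mask() above is the usual branch-free equality test: it expands x == y into an all-one or all-zero 64-bit mask so that mov_columns() can conditionally swap permutation entries and matrix columns without data-dependent branches. A minimal sketch of the idiom, illustrative only and not taken from the upstream sources:

#include <assert.h>
#include <stdint.h>

static uint64_t eq_mask(uint16_t x, uint16_t y) {
    uint64_t mask = x ^ y;   /* 0 iff x == y */
    mask -= 1;               /* wraps to all ones iff x == y */
    mask >>= 63;             /* 1 iff x == y, else 0 */
    mask = -mask;            /* all ones iff x == y, else 0 */
    return mask;
}

int main(void) {
    assert(eq_mask(7, 7) == UINT64_MAX);
    assert(eq_mask(7, 9) == 0);
    return 0;
}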
inverses + + PQCLEAN_MCELIECE6960119F_AVX_irr_load(sk_int, sk); + + PQCLEAN_MCELIECE6960119F_AVX_fft(eval, sk_int); + + PQCLEAN_MCELIECE6960119F_AVX_vec256_copy(prod[0], eval[0]); + + for (i = 1; i < 32; i++) { + PQCLEAN_MCELIECE6960119F_AVX_vec256_mul(prod[i], prod[i - 1], eval[i]); + } + + PQCLEAN_MCELIECE6960119F_AVX_vec256_inv(tmp, prod[31]); + + for (i = 30; i >= 0; i--) { + PQCLEAN_MCELIECE6960119F_AVX_vec256_mul(prod[i + 1], prod[i], tmp); + PQCLEAN_MCELIECE6960119F_AVX_vec256_mul(tmp, tmp, eval[i + 1]); + } + + PQCLEAN_MCELIECE6960119F_AVX_vec256_copy(prod[0], tmp); + + // fill matrix + + de_bitslicing(list, prod); + + for (i = 0; i < (1 << GFBITS); i++) { + list[i] <<= GFBITS; + list[i] |= i; + list[i] |= ((uint64_t) perm[i]) << 31; + } + + PQCLEAN_MCELIECE6960119F_AVX_sort_63b(1 << GFBITS, list); + + to_bitslicing_2x(consts, prod, list); + + for (i = 0; i < (1 << GFBITS); i++) { + perm[i] = list[i] & GFMASK; + } + + for (j = 0; j < NBLOCKS2_H; j++) { + for (k = 0; k < GFBITS; k++) { + mat[ k ][ 4 * j + 0 ] = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(prod[ j ][ k ], 0); + mat[ k ][ 4 * j + 1 ] = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(prod[ j ][ k ], 1); + mat[ k ][ 4 * j + 2 ] = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(prod[ j ][ k ], 2); + mat[ k ][ 4 * j + 3 ] = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(prod[ j ][ k ], 3); + } + } + + for (i = 1; i < SYS_T; i++) { + for (j = 0; j < NBLOCKS2_H; j++) { + PQCLEAN_MCELIECE6960119F_AVX_vec256_mul(prod[j], prod[j], consts[j]); + + for (k = 0; k < GFBITS; k++) { + mat[ i * GFBITS + k ][ 4 * j + 0 ] = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(prod[ j ][ k ], 0); + mat[ i * GFBITS + k ][ 4 * j + 1 ] = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(prod[ j ][ k ], 1); + mat[ i * GFBITS + k ][ 4 * j + 2 ] = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(prod[ j ][ k ], 2); + mat[ i * GFBITS + k ][ 4 * j + 3 ] = PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(prod[ j ][ k ], 3); + } + } + } + + // gaussian elimination + + for (row = 0; row < PK_NROWS; row++) { + i = row >> 6; + j = row & 63; + + if (row == GFBITS * SYS_T - 32) { + if (mov_columns(mat, perm)) { + return -1; + } + } + + for (k = row + 1; k < PK_NROWS; k++) { + mask = mat[ row ][ i ] >> j; + mask &= 1; + mask -= 1; + + for (c = 0; c < NBLOCKS1_H; c++) { + mat[ row ][ c ] ^= mat[ k ][ c ] & mask; + } + } + + if ( ((mat[ row ][ i ] >> j) & 1) == 0 ) { // return if not systematic + return -1; + } + + for (k = 0; k < row; k++) { + mask = mat[ k ][ i ] >> j; + mask &= 1; + mask = -mask; + + for (c = 0; c < NBLOCKS1_H; c++) { + mat[ k ][ c ] ^= mat[ row ][ c ] & mask; + } + } + + for (k = row + 1; k < PK_NROWS; k++) { + mask = mat[ k ][ i ] >> j; + mask &= 1; + mask = -mask; + + for (c = 0; c < NBLOCKS1_H; c++) { + mat[ k ][ c ] ^= mat[ row ][ c ] & mask; + } + } + } + + for (row = 0; row < PK_NROWS; row++) { + for (k = block_idx; k < NBLOCKS1_H; k++) { + one_row[k] = mat[ row ][k]; + } + + for (k = block_idx; k < NBLOCKS1_H - 1; k++) { + one_row[k] = (one_row[k] >> tail) | (one_row[k + 1] << (64 - tail)); + PQCLEAN_MCELIECE6960119F_AVX_store8(pk, one_row[k]); + pk += 8; + } + + one_row[k] >>= tail; + PQCLEAN_MCELIECE6960119F_AVX_store_i(pk, one_row[k], PK_ROW_BYTES % 8); + + pk[ (PK_ROW_BYTES % 8) - 1 ] &= (1 << (PK_NCOLS % 8)) - 1; // removing redundant bits + + pk += PK_ROW_BYTES % 8; + } + + // + + return 0; +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/pk_gen.h b/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/pk_gen.h new file mode 100644 
index 0000000000..ceddc05b7a --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/pk_gen.h @@ -0,0 +1,12 @@ +#ifndef PQCLEAN_MCELIECE6960119F_AVX_PK_GEN_H +#define PQCLEAN_MCELIECE6960119F_AVX_PK_GEN_H +/* + This file is for public-key generation +*/ + +#include + +int PQCLEAN_MCELIECE6960119F_AVX_pk_gen(unsigned char * /*pk*/, uint32_t * /*perm*/, const unsigned char * /*sk*/); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/scalars_2x.inc b/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/scalars_2x.inc new file mode 100644 index 0000000000..2f9b7474de --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/scalars_2x.inc @@ -0,0 +1,75 @@ +{ + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0X3C3CF30C0000C003, 0X0C0F0FCF0F0CF330), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0X0CCCC3F333C0000C, 0XF0000FC33C3CCF3C), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0X03C33F33FCC0C03C, 0X3C0F3F00C3C300FC), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0X0003000F3C03C0C0, 0X3C33CCC0F0F3CC30), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0XF33FF33030CF03F0, 0XC0CFFFFFCCCC30CC), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0X0CF0303300F0CCC0, 0X3FC3F3CCFFFC033F), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0XFF3F0C0CC0FF3CC0, 0XFC3030CCCCC0CFCF), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0XCF3CF0FF003FC000, 0X0FCF0C00CCF333C3), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0XC00FF3CF0303F300, 0XCFFCF33000CFF030), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0X3CCC0CC00CF0CC00, 0X00CFFCC330F30FCC), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0XF30FFC3C3FCCFC00, 0X3CCC3FCCC0F3FFF3), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0X3F0FC3F0CCF0C000, 0XF00F0C3FC003C0FF), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0X3000FF33CCF0F000, 0X330CCFCC03C0FC33) +}, +{ + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0X0F0F0FF0F000000F, 0XF0FFFFFFF0F00F00), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0X00FFFFFFFF0000F0, 0X00FFF0FFFF0000FF), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0XFFFF00FF00000F00, 0X00FF00000F0F0FFF), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0XFFF000F00F0FF000, 0XF000F0000F00FF0F), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0XFFF0000F0FF000F0, 0XFF000000FFF00000), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0X00FF000FFF000000, 0XF0FF000FF00F0FF0), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0XFF0F0FFF0F0FF000, 0X0F0F0F00FF000F0F), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0X0FFF0000000F0000, 0X0F0F00F0F0F0F000), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0X00F000F0FFF00F00, 0X00F00F00F00F000F), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0X00F00FF00F00F000, 0X00F0F0F00000FFF0), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0XFFF000F000F00000, 0XFFFFFF0FF00F0FFF), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0X00F00F000FF00000, 0X0F0FFFF00FFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0X0000FF0F0000F000, 0XFFFF0F0FFF0FFF00) +}, +{ + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0X00FF0000000000FF, 0X00FF00FF00FF0000), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0XFFFFFFFFFF00FF00, 0XFF00FFFF000000FF), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0XFF0000FF00FF0000, 0X0000FFFF000000FF), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0XFFFF000000FF0000, 0X00FFFF00FF000000), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0XFF00000000FF0000, 0XFFFFFF0000FF00FF), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0X00FFFFFFFF000000, 0X0000FFFF00FFFF00), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0XFF0000FFFFFF0000, 0XFF00FF0000FFFF00), + 
PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0XFF00FF00FFFF0000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0X00FFFFFFFF00FF00, 0X0000FF0000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0XFFFF000000000000, 0XFF00FFFF00FFFF00), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0X00FF0000FF000000, 0X00FFFF00000000FF), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0XFF00FF00FF000000, 0X0000FF00FF00FFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0X00FF00FFFF000000, 0XFF0000FFFFFF0000) +}, +{ + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0X000000000000FFFF, 0X0000FFFF00000000), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0XFFFFFFFFFFFF0000, 0XFFFFFFFF0000FFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0X0000000000000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0XFFFF0000FFFF0000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0XFFFFFFFFFFFF0000, 0X0000FFFF00000000), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0X0000FFFF00000000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0X0000FFFF00000000, 0XFFFFFFFF0000FFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0XFFFF000000000000, 0X00000000FFFF0000), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0XFFFF000000000000, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0XFFFF000000000000, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000) +}, +{ + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0X00000000FFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0X0000000000000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000) +} diff --git a/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/scalars_4x.inc b/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/scalars_4x.inc new file mode 100644 index 0000000000..b897c2c542 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/scalars_4x.inc @@ -0,0 +1,91 @@ +{ + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X3C3CF30C0000C003, 0X0C0F0FCF0F0CF330, 0XF0F30C33CF03F03F, 0X3F30CC0C000F3FCC), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0CCCC3F333C0000C, 0XF0000FC33C3CCF3C, 0X00F30FC00C3300FF, 0XFC3CF030FC3FFF03), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X03C33F33FCC0C03C, 0X3C0F3F00C3C300FC, 0XF3CC3CF3F3FCF33F, 0X33FFFCFF0CCF3CC3), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0003000F3C03C0C0, 0X3C33CCC0F0F3CC30, 0X3C0FC0FC303C3F3C, 0X003CFF33C3CC30CF), + 
PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XF33FF33030CF03F0, 0XC0CFFFFFCCCC30CC, 0XFC30CF303F3FF00F, 0XCFF3CF33C00F3003), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0CF0303300F0CCC0, 0X3FC3F3CCFFFC033F, 0X33300C0CC3300CF3, 0X00F3CC0CF3003CCF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFF3F0C0CC0FF3CC0, 0XFC3030CCCCC0CFCF, 0X3C030CF3F03FF3F3, 0X3C000CFCCC3C3333), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XCF3CF0FF003FC000, 0X0FCF0C00CCF333C3, 0X3CCC03FCCC3FFC03, 0XF3CF03C0FCF03FF0), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XC00FF3CF0303F300, 0XCFFCF33000CFF030, 0X033C3C3CF0003FC3, 0X3F3C3CF0C330330C), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X3CCC0CC00CF0CC00, 0X00CFFCC330F30FCC, 0XFFC0FF00F0FF0F03, 0X33CCFCC0FF0033F0), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XF30FFC3C3FCCFC00, 0X3CCC3FCCC0F3FFF3, 0XF3F30CF003FCC303, 0X33C300C0F0C003F3), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X3F0FC3F0CCF0C000, 0XF00F0C3FC003C0FF, 0X30CFCFC3CC0F3000, 0X003FF0003F00C00C), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X3000FF33CCF0F000, 0X330CCFCC03C0FC33, 0X0CF30CCF3FCFCC0F, 0XCFF3C3033F030FFF) +}, +{ + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0F0F0FF0F000000F, 0XF0FFFFFFF0F00F00, 0X0F0F00FF0FF0FFFF, 0XFF0F0F00F000F0FF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X00FFFFFFFF0000F0, 0X00FFF0FFFF0000FF, 0XF000F0F00F00FF0F, 0X0FFFFFFFFF00000F), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFF00FF00000F00, 0X00FF00000F0F0FFF, 0X000FFFF0FFF0FF0F, 0XF0FFFF000F00F0FF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFF000F00F0FF000, 0XF000F0000F00FF0F, 0X00F00FFF00000FF0, 0X0F0000F00FFF0FFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFF0000F0FF000F0, 0XFF000000FFF00000, 0XFFFFF0000FFFF00F, 0X0F0F0F00FF0F000F), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X00FF000FFF000000, 0XF0FF000FF00F0FF0, 0XFFF0FFF0000FFFF0, 0X000F0F0FFFF0F000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFF0F0FFF0F0FF000, 0X0F0F0F00FF000F0F, 0XF0F0F0000F0F0F00, 0XF0FFFF0F00F0FF0F), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0FFF0000000F0000, 0X0F0F00F0F0F0F000, 0X00F000F0F00FFF00, 0X0F0F000F0F00F0FF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X00F000F0FFF00F00, 0X00F00F00F00F000F, 0XF0FF0F0FFF00F0FF, 0X0000F0FF00FF0F0F), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X00F00FF00F00F000, 0X00F0F0F00000FFF0, 0XF0FF0FFFF0F0F0FF, 0X00FFFF0FF0FFF0F0), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFF000F000F00000, 0XFFFFFF0FF00F0FFF, 0X00FFFFFFFFFFFFF0, 0X0000000F00F0FFF0), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X00F00F000FF00000, 0X0F0FFFF00FFFFFFF, 0X00FFF0F0FF000F0F, 0XF0F00000FF00F0F0), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000FF0F0000F000, 0XFFFF0F0FFF0FFF00, 0X000FFFF0000FFF00, 0X0F0F0FFFFFFFFFFF) +}, +{ + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X00FF0000000000FF, 0X00FF00FF00FF0000, 0XFFFF00FF00FF00FF, 0XFF0000FFFFFF00FF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFFFF00FF00, 0XFF00FFFF000000FF, 0X00FFFF000000FF00, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFF0000FF00FF0000, 0X0000FFFF000000FF, 0XFFFF00FFFFFFFF00, 0XFFFF000000FFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFF000000FF0000, 0X00FFFF00FF000000, 0X0000FFFF00FFFFFF, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFF00000000FF0000, 0XFFFFFF0000FF00FF, 0X00FF0000FF0000FF, 0XFFFFFF00FFFFFF00), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X00FFFFFFFF000000, 0X0000FFFF00FFFF00, 0XFFFF0000FF00FFFF, 0X00FFFF00FFFF00FF), + 
PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFF0000FFFFFF0000, 0XFF00FF0000FFFF00, 0XFF000000FFFFFF00, 0X0000FFFF00FF0000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFF00FF00FFFF0000, 0X00000000FFFFFFFF, 0X000000000000FFFF, 0X000000FFFF000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X00FFFFFFFF00FF00, 0X0000FF0000000000, 0XFF00FF00FFFF0000, 0XFF00FF0000FF00FF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFF000000000000, 0XFF00FFFF00FFFF00, 0XFFFF00FFFF00FFFF, 0X00FF0000000000FF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X00FF0000FF000000, 0X00FFFF00000000FF, 0XFFFFFFFFFF00FF00, 0XFF00FFFF00FF00FF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFF00FF00FF000000, 0X0000FF00FF00FFFF, 0XFFFF00FFFF0000FF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X00FF00FFFF000000, 0XFF0000FFFFFF0000, 0X0000FF00000000FF, 0X0000FF000000FFFF) +}, +{ + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X000000000000FFFF, 0X0000FFFF00000000, 0XFFFFFFFFFFFFFFFF, 0X0000FFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFFFFFF0000, 0XFFFFFFFF0000FFFF, 0XFFFFFFFF00000000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X00000000FFFFFFFF, 0XFFFF000000000000, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000000000000000, 0X0000FFFF00000000, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFFFFFF0000, 0X0000FFFF00000000, 0X00000000FFFF0000, 0X00000000FFFF0000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000FFFF00000000, 0XFFFF0000FFFF0000, 0X0000FFFFFFFFFFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000, 0X0000FFFFFFFFFFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0XFFFFFFFF00000000, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000FFFF00000000, 0XFFFFFFFF0000FFFF, 0X000000000000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFF000000000000, 0X00000000FFFF0000, 0X000000000000FFFF, 0X0000FFFF00000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFF000000000000, 0XFFFF0000FFFFFFFF, 0XFFFFFFFFFFFF0000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFF000000000000, 0XFFFF0000FFFFFFFF, 0XFFFFFFFF0000FFFF, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0X0000000000000000, 0XFFFF0000FFFFFFFF, 0X0000FFFFFFFFFFFF) +}, +{ + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X0000000000000000, 0X00000000FFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFF00000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0X0000000000000000, 0XFFFFFFFF00000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0X00000000FFFFFFFF), + 
PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFF00000000, 0X00000000FFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFF00000000) +}, +{ + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF) +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/sk_gen.c b/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/sk_gen.c new file mode 100644 index 0000000000..21a02ac477 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/sk_gen.c @@ -0,0 +1,98 @@ +/* + This file is for secret-key generation +*/ + +#include "sk_gen.h" + +#include "controlbits.h" +#include "gf.h" +#include "params.h" +#include "util.h" + +/* input: f, element in GF((2^m)^t) */ +/* output: out, minimal polynomial of f */ +/* return: 0 for success and -1 for failure */ +int PQCLEAN_MCELIECE6960119F_AVX_genpoly_gen(gf *out, gf *f) { + int i, j, k, c; + + gf mat[ SYS_T + 1 ][ SYS_T ]; + gf mask, inv, t; + + // fill matrix + + mat[0][0] = 1; + + for (i = 1; i < SYS_T; i++) { + mat[0][i] = 0; + } + + for (i = 0; i < SYS_T; i++) { + mat[1][i] = f[i]; + } + + for (j = 2; j <= SYS_T; j++) { + PQCLEAN_MCELIECE6960119F_AVX_GF_mul(mat[j], mat[j - 1], f); + } + + // gaussian + + for (j = 0; j < SYS_T; j++) { + for (k = j + 1; k < SYS_T; k++) { + mask = PQCLEAN_MCELIECE6960119F_AVX_gf_iszero(mat[ j ][ j ]); + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ j ] ^= mat[ c ][ k ] & mask; + } + + } + + if ( mat[ j ][ j ] == 0 ) { // return if not systematic + return -1; + } + + inv = 
PQCLEAN_MCELIECE6960119F_AVX_gf_inv(mat[j][j]); + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ j ] = PQCLEAN_MCELIECE6960119F_AVX_gf_mul(mat[ c ][ j ], inv) ; + } + + for (k = 0; k < SYS_T; k++) { + if (k != j) { + t = mat[ j ][ k ]; + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ k ] ^= PQCLEAN_MCELIECE6960119F_AVX_gf_mul(mat[ c ][ j ], t); + } + } + } + } + + for (i = 0; i < SYS_T; i++) { + out[i] = mat[ SYS_T ][ i ]; + } + + return 0; +} + +/* input: permutation p represented as a list of 32-bit integers */ +/* output: -1 if some integer repeats in p */ +/* 0 otherwise */ +int PQCLEAN_MCELIECE6960119F_AVX_perm_check(const uint32_t *p) { + int i; + uint64_t list[1 << GFBITS]; + + for (i = 0; i < (1 << GFBITS); i++) { + list[i] = p[i]; + } + + PQCLEAN_MCELIECE6960119F_AVX_sort_63b(1 << GFBITS, list); + + for (i = 1; i < (1 << GFBITS); i++) { + if (list[i - 1] == list[i]) { + return -1; + } + } + + return 0; +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/sk_gen.h b/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/sk_gen.h new file mode 100644 index 0000000000..025f23c214 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/sk_gen.h @@ -0,0 +1,16 @@ +#ifndef PQCLEAN_MCELIECE6960119F_AVX_SK_GEN_H +#define PQCLEAN_MCELIECE6960119F_AVX_SK_GEN_H +/* + This file is for secret-key generation +*/ + + +#include "gf.h" + +#include <stdint.h> + +int PQCLEAN_MCELIECE6960119F_AVX_genpoly_gen(gf * /*out*/, gf * /*f*/); +int PQCLEAN_MCELIECE6960119F_AVX_perm_check(const uint32_t * /*p*/); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/syndrome_asm.S b/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/syndrome_asm.S new file mode 100644 index 0000000000..b5cabf8fd1 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/syndrome_asm.S @@ -0,0 +1,921 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: int64 b64 + +# qhasm: int64 synd + +# qhasm: int64 addr + +# qhasm: int64 c + +# qhasm: int64 c_all + +# qhasm: int64 row + +# qhasm: reg256 pp + +# qhasm: reg256 ee + +# qhasm: reg256 ss + +# qhasm: int64 b0 + +# qhasm: int64 b1 + +# qhasm: int64 i + +# qhasm: int64 p + +# qhasm: int64 e + +# qhasm: int64 s + +# qhasm: int64 tmp + +# qhasm: stack64 back + +# qhasm: int64 buf_ptr + +# qhasm: stack256 buf + +# qhasm: enter syndrome_asm +.p2align 5 +.global _PQCLEAN_MCELIECE6960119F_AVX_syndrome_asm +.global PQCLEAN_MCELIECE6960119F_AVX_syndrome_asm +_PQCLEAN_MCELIECE6960119F_AVX_syndrome_asm: +PQCLEAN_MCELIECE6960119F_AVX_syndrome_asm: +mov %rsp,%r11 +and $31,%r11 +add $64,%r11 +sub %r11,%rsp + +# qhasm: input_2 += 193 +# asm 1: add $193,tmp=int64#4 +# asm 2: movzbq 0(tmp=%rcx +movzbq 0(%rdx),%rcx + +# qhasm: back = tmp +# asm 1: movq back=stack64#1 +# asm 2: movq back=32(%rsp) +movq %rcx,32(%rsp) + +# qhasm: i = 0 +# asm 1: mov $0,>i=int64#4 +# asm 2: mov $0,>i=%rcx +mov $0,%rcx + +# qhasm: inner1: +._inner1: + +# qhasm: addr = input_2 + i +# asm 1: lea (addr=int64#5 +# asm 2: lea (addr=%r8 +lea (%rdx,%rcx),%r8 + +# qhasm: b0 = *(uint8 *) (addr + 0) +# asm 1: movzbq 0(b0=int64#6 +# asm 2: movzbq 0(b0=%r9 +movzbq 0(%r8),%r9 + +# qhasm: b1 = 
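As an aside, perm_check() above verifies that the (1 << GFBITS) random 32-bit values are pairwise distinct by sorting them with the constant-time sort_63b and then scanning adjacent entries. A minimal sketch of the same duplicate check, using libc qsort (which, unlike the upstream sorter, is not constant time); illustrative only, not taken from the upstream sources:

#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>

static int cmp_u32(const void *a, const void *b) {
    uint32_t x = *(const uint32_t *)a;
    uint32_t y = *(const uint32_t *)b;
    return (x > y) - (x < y);
}

/* returns -1 if some value repeats in p[0..n-1], 0 otherwise; sorts p in place */
static int has_duplicate(uint32_t *p, size_t n) {
    size_t i;
    qsort(p, n, sizeof p[0], cmp_u32);
    for (i = 1; i < n; i++) {
        if (p[i - 1] == p[i]) {
            return -1;
        }
    }
    return 0;
}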
*(uint8 *) (addr + 1) +# asm 1: movzbq 1(b1=int64#7 +# asm 2: movzbq 1(b1=%rax +movzbq 1(%r8),%rax + +# qhasm: (uint64) b0 >>= 3 +# asm 1: shr $3,b0=int64#4 +# asm 2: movzbq 1(b0=%rcx +movzbq 1(%r8),%rcx + +# qhasm: (uint64) b0 >>= 3 +# asm 1: shr $3,buf_ptr=int64#4 +# asm 2: leaq buf_ptr=%rcx +leaq 0(%rsp),%rcx + +# qhasm: row = 1547 +# asm 1: mov $1547,>row=int64#5 +# asm 2: mov $1547,>row=%r8 +mov $1547,%r8 + +# qhasm: loop: +._loop: + +# qhasm: row -= 1 +# asm 1: sub $1,ss=reg256#1 +# asm 2: vmovupd 0(ss=%ymm0 +vmovupd 0(%rsi),%ymm0 + +# qhasm: ee = mem256[ input_2 + 0 ] +# asm 1: vmovupd 0(ee=reg256#2 +# asm 2: vmovupd 0(ee=%ymm1 +vmovupd 0(%rdx),%ymm1 + +# qhasm: ss &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 32(pp=%ymm1 +vmovupd 32(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 32 ] +# asm 1: vmovupd 32(ee=reg256#3 +# asm 2: vmovupd 32(ee=%ymm2 +vmovupd 32(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 64(pp=%ymm1 +vmovupd 64(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 64 ] +# asm 1: vmovupd 64(ee=reg256#3 +# asm 2: vmovupd 64(ee=%ymm2 +vmovupd 64(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 96(pp=%ymm1 +vmovupd 96(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 96 ] +# asm 1: vmovupd 96(ee=reg256#3 +# asm 2: vmovupd 96(ee=%ymm2 +vmovupd 96(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 128(pp=%ymm1 +vmovupd 128(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 128 ] +# asm 1: vmovupd 128(ee=reg256#3 +# asm 2: vmovupd 128(ee=%ymm2 +vmovupd 128(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 160(pp=%ymm1 +vmovupd 160(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 160 ] +# asm 1: vmovupd 160(ee=reg256#3 +# asm 2: vmovupd 160(ee=%ymm2 +vmovupd 160(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 192(pp=%ymm1 +vmovupd 192(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 192 ] +# asm 1: vmovupd 192(ee=reg256#3 +# asm 2: vmovupd 192(ee=%ymm2 +vmovupd 192(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 224(pp=%ymm1 +vmovupd 224(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 224 ] +# asm 1: vmovupd 224(ee=reg256#3 +# asm 2: vmovupd 224(ee=%ymm2 +vmovupd 224(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 256(pp=%ymm1 +vmovupd 256(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 256 ] +# asm 1: vmovupd 256(ee=reg256#3 +# asm 2: vmovupd 256(ee=%ymm2 +vmovupd 256(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 288(pp=%ymm1 +vmovupd 288(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 288 ] +# asm 1: vmovupd 288(ee=reg256#3 +# asm 2: vmovupd 288(ee=%ymm2 +vmovupd 288(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 320(pp=%ymm1 +vmovupd 320(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 320 ] +# asm 1: vmovupd 320(ee=reg256#3 +# asm 2: vmovupd 320(ee=%ymm2 +vmovupd 320(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 352(pp=%ymm1 +vmovupd 352(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 352 ] +# asm 1: vmovupd 352(ee=reg256#3 +# asm 2: vmovupd 352(ee=%ymm2 +vmovupd 352(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 384(pp=%ymm1 +vmovupd 384(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 384 ] +# asm 1: vmovupd 384(ee=reg256#3 +# asm 2: vmovupd 384(ee=%ymm2 +vmovupd 384(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 
416(pp=%ymm1 +vmovupd 416(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 416 ] +# asm 1: vmovupd 416(ee=reg256#3 +# asm 2: vmovupd 416(ee=%ymm2 +vmovupd 416(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 448(pp=%ymm1 +vmovupd 448(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 448 ] +# asm 1: vmovupd 448(ee=reg256#3 +# asm 2: vmovupd 448(ee=%ymm2 +vmovupd 448(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 480(pp=%ymm1 +vmovupd 480(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 480 ] +# asm 1: vmovupd 480(ee=reg256#3 +# asm 2: vmovupd 480(ee=%ymm2 +vmovupd 480(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 512(pp=%ymm1 +vmovupd 512(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 512 ] +# asm 1: vmovupd 512(ee=reg256#3 +# asm 2: vmovupd 512(ee=%ymm2 +vmovupd 512(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 544(pp=%ymm1 +vmovupd 544(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 544 ] +# asm 1: vmovupd 544(ee=reg256#3 +# asm 2: vmovupd 544(ee=%ymm2 +vmovupd 544(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 576(pp=%ymm1 +vmovupd 576(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 576 ] +# asm 1: vmovupd 576(ee=reg256#3 +# asm 2: vmovupd 576(ee=%ymm2 +vmovupd 576(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 608(pp=%ymm1 +vmovupd 608(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 608 ] +# asm 1: vmovupd 608(ee=reg256#3 +# asm 2: vmovupd 608(ee=%ymm2 +vmovupd 608(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 640(pp=%ymm1 +vmovupd 640(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 640 ] +# asm 1: vmovupd 640(ee=reg256#3 +# asm 2: vmovupd 640(ee=%ymm2 +vmovupd 640(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand buf=stack256#1 +# asm 2: vmovapd buf=0(%rsp) +vmovapd %ymm0,0(%rsp) + +# qhasm: s = *(uint32 *) (input_1 + 672) +# asm 1: movl 672(s=int64#6d +# asm 2: movl 672(s=%r9d +movl 672(%rsi),%r9d + +# qhasm: e = *(uint32 *) (input_2 + 672) +# asm 1: movl 672(e=int64#7d +# asm 2: movl 672(e=%eax +movl 672(%rdx),%eax + +# qhasm: s &= e +# asm 1: and p=int64#7 +# asm 2: movzbq 676(p=%rax +movzbq 676(%rsi),%rax + +# qhasm: e = *(uint8 *) (input_2 + 676) +# asm 1: movzbq 676(e=int64#8 +# asm 2: movzbq 676(e=%r10 +movzbq 676(%rdx),%r10 + +# qhasm: p &= e +# asm 1: and c_all=int64#6 +# asm 2: popcnt c_all=%r9 +popcnt %r9, %r9 + +# qhasm: b64 = mem64[ buf_ptr + 0 ] +# asm 1: movq 0(b64=int64#7 +# asm 2: movq 0(b64=%rax +movq 0(%rcx),%rax + +# qhasm: c = count(b64) +# asm 1: popcnt c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# qhasm: c_all ^= c +# asm 1: xor b64=int64#7 +# asm 2: movq 8(b64=%rax +movq 8(%rcx),%rax + +# qhasm: c = count(b64) +# asm 1: popcnt c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# qhasm: c_all ^= c +# asm 1: xor b64=int64#7 +# asm 2: movq 16(b64=%rax +movq 16(%rcx),%rax + +# qhasm: c = count(b64) +# asm 1: popcnt c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# qhasm: c_all ^= c +# asm 1: xor b64=int64#7 +# asm 2: movq 24(b64=%rax +movq 24(%rcx),%rax + +# qhasm: c = count(b64) +# asm 1: popcnt c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# qhasm: c_all ^= c +# asm 1: xor addr=int64#7 +# asm 2: mov addr=%rax +mov %r8,%rax + +# qhasm: (uint64) addr >>= 3 +# asm 1: shr $3,synd=int64#8 +# asm 2: movzbq 0(synd=%r10 +movzbq 0(%rax),%r10 + +# qhasm: synd <<= 1 +# asm 1: shl $1,i=int64#2 +# asm 2: mov $676,>i=%rsi +mov 
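As an aside, the row loop above computes, in essence, one syndrome bit per public-key row: the row is ANDed with the error vector in 256-bit chunks, the chunks are folded together, and popcnt reduces the result to its parity, i.e. the GF(2) inner product of the row with the error vector. A scalar C sketch of that computation; illustrative only, not taken from the upstream sources:

#include <stddef.h>
#include <stdint.h>

/* GF(2) dot product: parity of the AND of two bit vectors stored as 64-bit words */
static uint8_t gf2_dot(const uint64_t *row, const uint64_t *e, size_t words) {
    uint64_t acc = 0;
    size_t i;
    for (i = 0; i < words; i++) {
        acc ^= row[i] & e[i];
    }
    acc ^= acc >> 32;   /* fold acc down to its parity */
    acc ^= acc >> 16;
    acc ^= acc >> 8;
    acc ^= acc >> 4;
    acc ^= acc >> 2;
    acc ^= acc >> 1;
    return (uint8_t)(acc & 1);
}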
$676,%rsi + +# qhasm: inner2: +._inner2: + +# qhasm: i -= 1 +# asm 1: sub $1,addr=int64#4 +# asm 2: lea (addr=%rcx +lea (%rdx,%rsi),%rcx + +# qhasm: b0 = *(uint8 *) (addr + 0) +# asm 1: movzbq 0(b0=int64#5 +# asm 2: movzbq 0(b0=%r8 +movzbq 0(%rcx),%r8 + +# qhasm: b1 = *(uint8 *) (addr + 1) +# asm 1: movzbq 1(b1=int64#6 +# asm 2: movzbq 1(b1=%r9 +movzbq 1(%rcx),%r9 + +# qhasm: (uint64) b0 >>= 5 +# asm 1: shr $5,tmp=int64#2 +# asm 2: movq tmp=%rsi +movq 32(%rsp),%rsi + +# qhasm: *(uint8 *) (input_2 + 0) = tmp +# asm 1: movb i=int64#2 +# asm 2: mov $0,>i=%rsi +mov $0,%rsi + +# qhasm: inner3: +._inner3: + +# qhasm: s = *(uint8 *) (input_0 + 0) +# asm 1: movzbq 0(s=int64#4 +# asm 2: movzbq 0(s=%rcx +movzbq 0(%rdi),%rcx + +# qhasm: e = *(uint8 *) (input_2 + 0) +# asm 1: movzbq 0(e=int64#5 +# asm 2: movzbq 0(e=%r8 +movzbq 0(%rdx),%r8 + +# qhasm: s ^= e +# asm 1: xor s=int64#2 +# asm 2: movzbq 0(s=%rsi +movzbq 0(%rdi),%rsi + +# qhasm: e = *(uint8 *) (input_2 + 0) +# asm 1: movzbq 0(e=int64#3 +# asm 2: movzbq 0(e=%rdx +movzbq 0(%rdx),%rdx + +# qhasm: (uint32) e &= 7 +# asm 1: and $7,mask0=reg128#1 +# asm 2: movdqa PQCLEAN_MCELIECE6960119F_AVX_MASK5_0(%rip),>mask0=%xmm0 +movdqa PQCLEAN_MCELIECE6960119F_AVX_MASK5_0(%rip),%xmm0 + +# qhasm: mask1 aligned= mem128[ PQCLEAN_MCELIECE6960119F_AVX_MASK5_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE6960119F_AVX_MASK5_1(%rip),>mask1=reg128#2 +# asm 2: movdqa PQCLEAN_MCELIECE6960119F_AVX_MASK5_1(%rip),>mask1=%xmm1 +movdqa PQCLEAN_MCELIECE6960119F_AVX_MASK5_1(%rip),%xmm1 + +# qhasm: mask2 aligned= mem128[ PQCLEAN_MCELIECE6960119F_AVX_MASK4_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE6960119F_AVX_MASK4_0(%rip),>mask2=reg128#3 +# asm 2: movdqa PQCLEAN_MCELIECE6960119F_AVX_MASK4_0(%rip),>mask2=%xmm2 +movdqa PQCLEAN_MCELIECE6960119F_AVX_MASK4_0(%rip),%xmm2 + +# qhasm: mask3 aligned= mem128[ PQCLEAN_MCELIECE6960119F_AVX_MASK4_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE6960119F_AVX_MASK4_1(%rip),>mask3=reg128#4 +# asm 2: movdqa PQCLEAN_MCELIECE6960119F_AVX_MASK4_1(%rip),>mask3=%xmm3 +movdqa PQCLEAN_MCELIECE6960119F_AVX_MASK4_1(%rip),%xmm3 + +# qhasm: mask4 aligned= mem128[ PQCLEAN_MCELIECE6960119F_AVX_MASK3_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE6960119F_AVX_MASK3_0(%rip),>mask4=reg128#5 +# asm 2: movdqa PQCLEAN_MCELIECE6960119F_AVX_MASK3_0(%rip),>mask4=%xmm4 +movdqa PQCLEAN_MCELIECE6960119F_AVX_MASK3_0(%rip),%xmm4 + +# qhasm: mask5 aligned= mem128[ PQCLEAN_MCELIECE6960119F_AVX_MASK3_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE6960119F_AVX_MASK3_1(%rip),>mask5=reg128#6 +# asm 2: movdqa PQCLEAN_MCELIECE6960119F_AVX_MASK3_1(%rip),>mask5=%xmm5 +movdqa PQCLEAN_MCELIECE6960119F_AVX_MASK3_1(%rip),%xmm5 + +# qhasm: x0 = mem128[ input_0 + 0 ] +# asm 1: movdqu 0(x0=reg128#7 +# asm 2: movdqu 0(x0=%xmm6 +movdqu 0(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 128 ] +# asm 1: movdqu 128(x1=reg128#8 +# asm 2: movdqu 128(x1=%xmm7 +movdqu 128(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 256 ] +# asm 1: movdqu 256(x2=reg128#9 +# asm 2: movdqu 256(x2=%xmm8 +movdqu 256(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 384 ] +# asm 1: movdqu 384(x3=reg128#10 +# asm 2: movdqu 384(x3=%xmm9 +movdqu 384(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 512 ] +# asm 1: movdqu 512(x4=reg128#11 +# asm 2: movdqu 512(x4=%xmm10 +movdqu 512(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 640 ] +# asm 1: movdqu 640(x5=reg128#12 +# asm 2: movdqu 640(x5=%xmm11 +movdqu 640(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 768 ] +# asm 1: movdqu 768(x6=reg128#13 +# asm 2: movdqu 768(x6=%xmm12 +movdqu 768(%rdi),%xmm12 + +# qhasm: x7 = 
mem128[ input_0 + 896 ] +# asm 1: movdqu 896(x7=reg128#14 +# asm 2: movdqu 896(x7=%xmm13 +movdqu 896(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# 
qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# 
asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 0 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 16(x0=%xmm6 +movdqu 16(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 144 ] +# asm 1: movdqu 144(x1=reg128#8 +# asm 2: movdqu 144(x1=%xmm7 +movdqu 144(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 272 ] +# asm 1: movdqu 272(x2=reg128#9 +# asm 2: movdqu 272(x2=%xmm8 +movdqu 272(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 400 ] +# asm 1: movdqu 400(x3=reg128#10 +# asm 2: movdqu 400(x3=%xmm9 +movdqu 400(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 528 ] +# asm 1: movdqu 528(x4=reg128#11 +# asm 2: movdqu 528(x4=%xmm10 +movdqu 528(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 656 ] +# asm 1: movdqu 656(x5=reg128#12 +# asm 2: movdqu 656(x5=%xmm11 +movdqu 656(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 784 ] +# asm 1: movdqu 784(x6=reg128#13 +# asm 2: movdqu 784(x6=%xmm12 +movdqu 784(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 912 ] +# asm 1: movdqu 912(x7=reg128#14 +# asm 2: movdqu 912(x7=%xmm13 +movdqu 912(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 
+# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 
2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 16 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 32(x0=%xmm6 +movdqu 32(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 160 ] +# asm 1: movdqu 160(x1=reg128#8 +# asm 2: movdqu 160(x1=%xmm7 
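As an aside, the mask-and-shift pattern above is one butterfly step of a bitwise transpose, applied at 32-, 16- and 8-bit widths. A scalar C sketch of the 32-bit step on two 64-bit lanes, assuming the paired MASK constants select the lower and upper halves at each width; illustrative only, not taken from the upstream sources:

#include <stdint.h>

/* after the call: *a = (a_low, b_low), *b = (a_high, b_high) */
static void transpose_step32(uint64_t *a, uint64_t *b) {
    const uint64_t lo_mask = 0x00000000FFFFFFFFULL;
    const uint64_t hi_mask = 0xFFFFFFFF00000000ULL;
    uint64_t v00 = *a & lo_mask;   /* keep a's low half          */
    uint64_t v10 = *b << 32;       /* b's low half, moved up     */
    uint64_t v01 = *a >> 32;       /* a's high half, moved down  */
    uint64_t v11 = *b & hi_mask;   /* keep b's high half         */
    *a = v00 | v10;
    *b = v01 | v11;
}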
+movdqu 160(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 288 ] +# asm 1: movdqu 288(x2=reg128#9 +# asm 2: movdqu 288(x2=%xmm8 +movdqu 288(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 416 ] +# asm 1: movdqu 416(x3=reg128#10 +# asm 2: movdqu 416(x3=%xmm9 +movdqu 416(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 544 ] +# asm 1: movdqu 544(x4=reg128#11 +# asm 2: movdqu 544(x4=%xmm10 +movdqu 544(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 672 ] +# asm 1: movdqu 672(x5=reg128#12 +# asm 2: movdqu 672(x5=%xmm11 +movdqu 672(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 800 ] +# asm 1: movdqu 800(x6=reg128#13 +# asm 2: movdqu 800(x6=%xmm12 +movdqu 800(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 928 ] +# asm 1: movdqu 928(x7=reg128#14 +# asm 2: movdqu 928(x7=%xmm13 +movdqu 928(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# 
qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 
<< 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 32 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 48(x0=%xmm6 +movdqu 48(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 176 ] +# asm 1: movdqu 176(x1=reg128#8 +# asm 2: movdqu 176(x1=%xmm7 +movdqu 176(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 304 ] +# asm 1: movdqu 304(x2=reg128#9 +# asm 2: movdqu 304(x2=%xmm8 +movdqu 304(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 432 ] +# asm 1: movdqu 432(x3=reg128#10 +# asm 2: movdqu 432(x3=%xmm9 +movdqu 432(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 560 ] +# asm 1: movdqu 560(x4=reg128#11 +# asm 2: movdqu 560(x4=%xmm10 +movdqu 560(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 688 ] +# asm 1: movdqu 688(x5=reg128#12 +# asm 2: movdqu 688(x5=%xmm11 +movdqu 688(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 816 ] +# asm 1: movdqu 816(x6=reg128#13 +# asm 2: movdqu 816(x6=%xmm12 +movdqu 816(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 944 ] +# asm 1: movdqu 944(x7=reg128#14 +# asm 2: movdqu 944(x7=%xmm13 +movdqu 944(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor 
x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand 
v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw 
$8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 48 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 64(x0=%xmm6 +movdqu 64(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 192 ] +# asm 1: movdqu 192(x1=reg128#8 +# asm 2: movdqu 192(x1=%xmm7 +movdqu 192(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 320 ] +# asm 1: movdqu 320(x2=reg128#9 +# asm 2: movdqu 320(x2=%xmm8 +movdqu 320(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 448 ] +# asm 1: movdqu 448(x3=reg128#10 +# asm 2: movdqu 448(x3=%xmm9 +movdqu 448(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 576 ] +# asm 1: movdqu 576(x4=reg128#11 +# asm 2: movdqu 576(x4=%xmm10 +movdqu 576(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 704 ] +# asm 1: movdqu 704(x5=reg128#12 +# asm 2: movdqu 704(x5=%xmm11 +movdqu 704(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 832 ] +# asm 1: movdqu 832(x6=reg128#13 +# asm 2: movdqu 832(x6=%xmm12 +movdqu 832(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 960 ] +# asm 1: movdqu 960(x7=reg128#14 +# asm 2: movdqu 960(x7=%xmm13 +movdqu 960(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor 
%xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand 
%xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 64 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 80(x0=%xmm6 +movdqu 80(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 208 ] +# asm 1: movdqu 208(x1=reg128#8 +# asm 2: movdqu 208(x1=%xmm7 +movdqu 208(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 336 ] +# asm 1: movdqu 336(x2=reg128#9 +# asm 2: movdqu 336(x2=%xmm8 +movdqu 336(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 464 ] +# asm 1: movdqu 464(x3=reg128#10 +# asm 2: movdqu 464(x3=%xmm9 +movdqu 464(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 592 ] +# asm 1: movdqu 592(x4=reg128#11 +# asm 2: movdqu 592(x4=%xmm10 +movdqu 592(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 720 ] +# asm 1: movdqu 720(x5=reg128#12 +# asm 2: movdqu 720(x5=%xmm11 +movdqu 720(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 848 ] +# asm 1: movdqu 848(x6=reg128#13 +# asm 2: movdqu 848(x6=%xmm12 +movdqu 848(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 
+ 976 ] +# asm 1: movdqu 976(x7=reg128#14 +# asm 2: movdqu 976(x7=%xmm13 +movdqu 976(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 
& mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw 
$8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 80 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 96(x0=%xmm6 +movdqu 96(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 224 ] +# asm 1: movdqu 224(x1=reg128#8 +# asm 2: movdqu 224(x1=%xmm7 +movdqu 224(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 352 ] +# asm 1: movdqu 352(x2=reg128#9 +# asm 2: movdqu 352(x2=%xmm8 +movdqu 352(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 480 ] +# asm 1: movdqu 480(x3=reg128#10 +# asm 2: movdqu 480(x3=%xmm9 +movdqu 480(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 608 ] +# asm 1: movdqu 608(x4=reg128#11 +# asm 2: movdqu 608(x4=%xmm10 +movdqu 608(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 736 ] +# asm 1: movdqu 736(x5=reg128#12 +# asm 2: movdqu 736(x5=%xmm11 +movdqu 736(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 864 ] +# asm 1: movdqu 864(x6=reg128#13 +# asm 2: movdqu 864(x6=%xmm12 +movdqu 864(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 992 ] +# asm 1: movdqu 992(x7=reg128#14 +# asm 2: movdqu 992(x7=%xmm13 +movdqu 992(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor 
x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand 
v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 96 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 112(x0=%xmm6 +movdqu 112(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 240 ] +# asm 1: movdqu 240(x1=reg128#8 +# asm 2: movdqu 240(x1=%xmm7 +movdqu 
240(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 368 ] +# asm 1: movdqu 368(x2=reg128#9 +# asm 2: movdqu 368(x2=%xmm8 +movdqu 368(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 496 ] +# asm 1: movdqu 496(x3=reg128#10 +# asm 2: movdqu 496(x3=%xmm9 +movdqu 496(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 624 ] +# asm 1: movdqu 624(x4=reg128#11 +# asm 2: movdqu 624(x4=%xmm10 +movdqu 624(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 752 ] +# asm 1: movdqu 752(x5=reg128#12 +# asm 2: movdqu 752(x5=%xmm11 +movdqu 752(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 880 ] +# asm 1: movdqu 880(x6=reg128#13 +# asm 2: movdqu 880(x6=%xmm12 +movdqu 880(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 1008 ] +# asm 1: movdqu 1008(x7=reg128#14 +# asm 2: movdqu 1008(x7=%xmm13 +movdqu 1008(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#1 +# asm 2: vpand v00=%xmm0 +vpand %xmm0,%xmm9,%xmm0 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#13 +# asm 2: vpsllq $32,v10=%xmm12 +vpsllq $32,%xmm13,%xmm12 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm1,%xmm13,%xmm1 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm12,%xmm0,%xmm0 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm9,%xmm1 + +# qhasm: v00 = x0 & 
mask2 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm2,%xmm14,%xmm9 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#13 +# asm 2: vpslld $16,v10=%xmm12 +vpslld $16,%xmm11,%xmm12 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#14 +# asm 2: vpsrld $16,v01=%xmm13 +vpsrld $16,%xmm14,%xmm13 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm12,%xmm9,%xmm9 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm13,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm10,%xmm12 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#14 +# asm 2: vpslld $16,v10=%xmm13 +vpslld $16,%xmm0,%xmm13 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm3,%xmm0,%xmm0 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#13 +# asm 2: vpor x1=%xmm12 +vpor %xmm13,%xmm12,%xmm12 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm10,%xmm0 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm2,%xmm6,%xmm10 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#14 +# asm 2: vpslld $16,v10=%xmm13 +vpslld $16,%xmm8,%xmm13 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#11 +# asm 2: vpor x4=%xmm10 +vpor %xmm13,%xmm10,%xmm10 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#3 +# asm 2: vpand v00=%xmm2 +vpand %xmm2,%xmm7,%xmm2 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#9 +# asm 2: vpslld $16,v10=%xmm8 +vpslld $16,%xmm1,%xmm8 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm3,%xmm1,%xmm1 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#3 +# asm 2: vpor x5=%xmm2 +vpor %xmm8,%xmm2,%xmm2 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm7,%xmm1 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#4 +# asm 2: vpand v00=%xmm3 +vpand %xmm4,%xmm9,%xmm3 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#8 +# asm 2: vpsllw $8,v10=%xmm7 +vpsllw $8,%xmm12,%xmm7 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#9 +# asm 2: vpsrlw $8,v01=%xmm8 +vpsrlw $8,%xmm9,%xmm8 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm5,%xmm12,%xmm9 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#4 +# asm 2: vpor x0=%xmm3 +vpor %xmm7,%xmm3,%xmm3 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#8 +# asm 2: vpor x1=%xmm7 +vpor %xmm9,%xmm8,%xmm7 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm4,%xmm11,%xmm8 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#10 +# asm 2: vpsllw 
$8,v10=%xmm9 +vpsllw $8,%xmm0,%xmm9 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm5,%xmm0,%xmm0 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#9 +# asm 2: vpor x2=%xmm8 +vpor %xmm9,%xmm8,%xmm8 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm11,%xmm0 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm10,%xmm9 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#12 +# asm 2: vpsllw $8,v10=%xmm11 +vpsllw $8,%xmm2,%xmm11 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#11 +# asm 2: vpsrlw $8,v01=%xmm10 +vpsrlw $8,%xmm10,%xmm10 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#3 +# asm 2: vpand v11=%xmm2 +vpand %xmm5,%xmm2,%xmm2 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#10 +# asm 2: vpor x4=%xmm9 +vpor %xmm11,%xmm9,%xmm9 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#3 +# asm 2: vpor x5=%xmm2 +vpor %xmm2,%xmm10,%xmm2 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#5 +# asm 2: vpand v00=%xmm4 +vpand %xmm4,%xmm6,%xmm4 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#11 +# asm 2: vpsllw $8,v10=%xmm10 +vpsllw $8,%xmm1,%xmm10 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm5,%xmm1,%xmm1 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#5 +# asm 2: vpor x6=%xmm4 +vpor %xmm10,%xmm4,%xmm4 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm6,%xmm1 + +# qhasm: mem128[ input_0 + 112 ] = x0 +# asm 1: movdqu mask0=reg128#1 +# asm 2: movdqa PQCLEAN_MCELIECE6960119F_AVX_MASK2_0(%rip),>mask0=%xmm0 +movdqa PQCLEAN_MCELIECE6960119F_AVX_MASK2_0(%rip),%xmm0 + +# qhasm: mask1 aligned= mem128[ PQCLEAN_MCELIECE6960119F_AVX_MASK2_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE6960119F_AVX_MASK2_1(%rip),>mask1=reg128#2 +# asm 2: movdqa PQCLEAN_MCELIECE6960119F_AVX_MASK2_1(%rip),>mask1=%xmm1 +movdqa PQCLEAN_MCELIECE6960119F_AVX_MASK2_1(%rip),%xmm1 + +# qhasm: mask2 aligned= mem128[ PQCLEAN_MCELIECE6960119F_AVX_MASK1_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE6960119F_AVX_MASK1_0(%rip),>mask2=reg128#3 +# asm 2: movdqa PQCLEAN_MCELIECE6960119F_AVX_MASK1_0(%rip),>mask2=%xmm2 +movdqa PQCLEAN_MCELIECE6960119F_AVX_MASK1_0(%rip),%xmm2 + +# qhasm: mask3 aligned= mem128[ PQCLEAN_MCELIECE6960119F_AVX_MASK1_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE6960119F_AVX_MASK1_1(%rip),>mask3=reg128#4 +# asm 2: movdqa PQCLEAN_MCELIECE6960119F_AVX_MASK1_1(%rip),>mask3=%xmm3 +movdqa PQCLEAN_MCELIECE6960119F_AVX_MASK1_1(%rip),%xmm3 + +# qhasm: mask4 aligned= mem128[ PQCLEAN_MCELIECE6960119F_AVX_MASK0_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE6960119F_AVX_MASK0_0(%rip),>mask4=reg128#5 +# asm 2: movdqa PQCLEAN_MCELIECE6960119F_AVX_MASK0_0(%rip),>mask4=%xmm4 +movdqa PQCLEAN_MCELIECE6960119F_AVX_MASK0_0(%rip),%xmm4 + +# qhasm: mask5 aligned= mem128[ PQCLEAN_MCELIECE6960119F_AVX_MASK0_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE6960119F_AVX_MASK0_1(%rip),>mask5=reg128#6 +# asm 2: movdqa PQCLEAN_MCELIECE6960119F_AVX_MASK0_1(%rip),>mask5=%xmm5 +movdqa PQCLEAN_MCELIECE6960119F_AVX_MASK0_1(%rip),%xmm5 + +# qhasm: x0 = mem128[ input_0 + 0 ] +# asm 1: movdqu 0(x0=reg128#7 +# asm 2: movdqu 0(x0=%xmm6 +movdqu 0(%rdi),%xmm6 + +# qhasm: x1 = 
mem128[ input_0 + 16 ]

[ The remainder of this hunk is the qhasm-generated SSE assembly added by the patch.  In this capture the per-instruction
line breaks have been collapsed and the angle-bracketed operands of the "# asm 1:" / "# asm 2:" annotation comments have
been stripped, so the instruction stream is summarized here rather than reproduced verbatim.
The routine walks the buffer passed in %rdi (input_0) in 128-byte blocks; the captured portion covers the blocks at base
offsets 0, 128, 256, 384, 512, 640, 768 and 896.  For each block it loads eight 128-bit rows x0..x7 with movdqu, then runs
three mask/shift/merge rounds against the constant masks mask0..mask5: rows four apart are paired with psllq/psrlq shifts
of 4 (masks mask0/mask1), rows two apart with shifts of 2 (mask2/mask3), and adjacent rows with shifts of 1 (mask4/mask5).
Writing (maskE, maskO) for the mask pair and s for the shift count, each pairing computes v00 = a & maskE,
v10 = (b & maskE) << s, v01 = (a & maskO) >> s and v11 = b & maskO with vpand, then recombines a = v00 | v10 and
b = v01 | v11 with vpor, which is the standard butterfly step of a bit-matrix transpose.  The updated rows are written
back in place with movdqu before the next block is loaded.
A hedged C sketch of one block of this transform follows; after it, the captured stream resumes part-way through the
shift-by-1 round of the block at offsets 896-1008. ]
%xmm5,%xmm10,%xmm10 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#3 +# asm 2: vpand v11=%xmm2 +vpand %xmm5,%xmm2,%xmm2 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#10 +# asm 2: vpor x4=%xmm9 +vpor %xmm11,%xmm9,%xmm9 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#3 +# asm 2: vpor x5=%xmm2 +vpor %xmm2,%xmm10,%xmm2 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm4,%xmm6,%xmm10 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#5 +# asm 2: vpand v10=%xmm4 +vpand %xmm4,%xmm1,%xmm4 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm5,%xmm1,%xmm1 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#5 +# asm 2: vpor x6=%xmm4 +vpor %xmm4,%xmm10,%xmm4 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm6,%xmm1 + +# qhasm: mem128[ input_0 + 896 ] = x0 +# asm 1: movdqu mask0=reg256#1 +# asm 2: vmovapd PQCLEAN_MCELIECE6960119F_AVX_MASK5_0,>mask0=%ymm0 +vmovapd PQCLEAN_MCELIECE6960119F_AVX_MASK5_0(%rip),%ymm0 + +# qhasm: mask1 aligned= mem256[ PQCLEAN_MCELIECE6960119F_AVX_MASK5_1 ] +# asm 1: vmovapd PQCLEAN_MCELIECE6960119F_AVX_MASK5_1,>mask1=reg256#2 +# asm 2: vmovapd PQCLEAN_MCELIECE6960119F_AVX_MASK5_1,>mask1=%ymm1 +vmovapd PQCLEAN_MCELIECE6960119F_AVX_MASK5_1(%rip),%ymm1 + +# qhasm: mask2 aligned= mem256[ PQCLEAN_MCELIECE6960119F_AVX_MASK4_0 ] +# asm 1: vmovapd PQCLEAN_MCELIECE6960119F_AVX_MASK4_0,>mask2=reg256#3 +# asm 2: vmovapd PQCLEAN_MCELIECE6960119F_AVX_MASK4_0,>mask2=%ymm2 +vmovapd PQCLEAN_MCELIECE6960119F_AVX_MASK4_0(%rip),%ymm2 + +# qhasm: mask3 aligned= mem256[ PQCLEAN_MCELIECE6960119F_AVX_MASK4_1 ] +# asm 1: vmovapd PQCLEAN_MCELIECE6960119F_AVX_MASK4_1,>mask3=reg256#4 +# asm 2: vmovapd PQCLEAN_MCELIECE6960119F_AVX_MASK4_1,>mask3=%ymm3 +vmovapd PQCLEAN_MCELIECE6960119F_AVX_MASK4_1(%rip),%ymm3 + +# qhasm: mask4 aligned= mem256[ PQCLEAN_MCELIECE6960119F_AVX_MASK3_0 ] +# asm 1: vmovapd PQCLEAN_MCELIECE6960119F_AVX_MASK3_0,>mask4=reg256#5 +# asm 2: vmovapd PQCLEAN_MCELIECE6960119F_AVX_MASK3_0,>mask4=%ymm4 +vmovapd PQCLEAN_MCELIECE6960119F_AVX_MASK3_0(%rip),%ymm4 + +# qhasm: mask5 aligned= mem256[ PQCLEAN_MCELIECE6960119F_AVX_MASK3_1 ] +# asm 1: vmovapd PQCLEAN_MCELIECE6960119F_AVX_MASK3_1,>mask5=reg256#6 +# asm 2: vmovapd PQCLEAN_MCELIECE6960119F_AVX_MASK3_1,>mask5=%ymm5 +vmovapd PQCLEAN_MCELIECE6960119F_AVX_MASK3_1(%rip),%ymm5 + +# qhasm: x0 = mem256[ input_0 + 0 ] +# asm 1: vmovupd 0(x0=reg256#7 +# asm 2: vmovupd 0(x0=%ymm6 +vmovupd 0(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 256 ] +# asm 1: vmovupd 256(x1=reg256#8 +# asm 2: vmovupd 256(x1=%ymm7 +vmovupd 256(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 512 ] +# asm 1: vmovupd 512(x2=reg256#9 +# asm 2: vmovupd 512(x2=%ymm8 +vmovupd 512(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 768 ] +# asm 1: vmovupd 768(x3=reg256#10 +# asm 2: vmovupd 768(x3=%ymm9 +vmovupd 768(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1024 ] +# asm 1: vmovupd 1024(x4=reg256#11 +# asm 2: vmovupd 1024(x4=%ymm10 +vmovupd 1024(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1280 ] +# asm 1: vmovupd 1280(x5=reg256#12 +# asm 2: vmovupd 1280(x5=%ymm11 +vmovupd 1280(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1536 ] +# asm 1: vmovupd 1536(x6=reg256#13 +# asm 2: vmovupd 1536(x6=%ymm12 +vmovupd 1536(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1792 ] +# asm 1: vmovupd 
1792(x7=reg256#14 +# asm 2: vmovupd 1792(x7=%ymm13 +vmovupd 1792(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: 
vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 
+# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 0 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 32(x0=%ymm6 +vmovupd 32(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 288 ] +# asm 1: vmovupd 288(x1=reg256#8 +# asm 2: vmovupd 288(x1=%ymm7 +vmovupd 288(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 544 ] +# asm 1: vmovupd 544(x2=reg256#9 +# asm 2: vmovupd 544(x2=%ymm8 +vmovupd 544(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 800 ] +# asm 1: vmovupd 800(x3=reg256#10 +# asm 2: vmovupd 800(x3=%ymm9 +vmovupd 800(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1056 ] +# asm 1: vmovupd 1056(x4=reg256#11 +# asm 2: vmovupd 1056(x4=%ymm10 +vmovupd 1056(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1312 ] +# asm 1: vmovupd 1312(x5=reg256#12 +# asm 2: vmovupd 1312(x5=%ymm11 +vmovupd 1312(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1568 ] +# asm 1: vmovupd 1568(x6=reg256#13 +# asm 2: vmovupd 1568(x6=%ymm12 +vmovupd 1568(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1824 ] +# asm 1: vmovupd 1824(x7=reg256#14 +# asm 2: vmovupd 1824(x7=%ymm13 +vmovupd 1824(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor 
x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand 
v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 32 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 64(x0=%ymm6 +vmovupd 64(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 320 ] +# asm 1: vmovupd 320(x1=reg256#8 +# 
asm 2: vmovupd 320(x1=%ymm7 +vmovupd 320(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 576 ] +# asm 1: vmovupd 576(x2=reg256#9 +# asm 2: vmovupd 576(x2=%ymm8 +vmovupd 576(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 832 ] +# asm 1: vmovupd 832(x3=reg256#10 +# asm 2: vmovupd 832(x3=%ymm9 +vmovupd 832(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1088 ] +# asm 1: vmovupd 1088(x4=reg256#11 +# asm 2: vmovupd 1088(x4=%ymm10 +vmovupd 1088(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1344 ] +# asm 1: vmovupd 1344(x5=reg256#12 +# asm 2: vmovupd 1344(x5=%ymm11 +vmovupd 1344(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1600 ] +# asm 1: vmovupd 1600(x6=reg256#13 +# asm 2: vmovupd 1600(x6=%ymm12 +vmovupd 1600(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1856 ] +# asm 1: vmovupd 1856(x7=reg256#14 +# asm 2: vmovupd 1856(x7=%ymm13 +vmovupd 1856(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor 
x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: 
vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 64 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 96(x0=%ymm6 +vmovupd 96(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 352 ] +# asm 1: vmovupd 352(x1=reg256#8 +# asm 2: vmovupd 352(x1=%ymm7 +vmovupd 352(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 608 ] +# asm 1: vmovupd 608(x2=reg256#9 +# asm 2: vmovupd 608(x2=%ymm8 +vmovupd 608(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 864 ] +# asm 1: vmovupd 864(x3=reg256#10 +# asm 2: vmovupd 864(x3=%ymm9 +vmovupd 864(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1120 ] +# asm 1: vmovupd 1120(x4=reg256#11 +# asm 2: vmovupd 1120(x4=%ymm10 +vmovupd 1120(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1376 ] +# asm 1: vmovupd 1376(x5=reg256#12 +# asm 2: vmovupd 1376(x5=%ymm11 +vmovupd 1376(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1632 ] +# asm 1: vmovupd 1632(x6=reg256#13 +# asm 2: vmovupd 1632(x6=%ymm12 +vmovupd 1632(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1888 ] +# asm 1: vmovupd 1888(x7=reg256#14 +# asm 2: vmovupd 1888(x7=%ymm13 +vmovupd 1888(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: 
vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor 
x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 
+vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 96 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 128(x0=%ymm6 +vmovupd 128(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 384 ] +# asm 1: vmovupd 384(x1=reg256#8 +# asm 2: vmovupd 384(x1=%ymm7 +vmovupd 384(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 640 ] +# asm 1: vmovupd 640(x2=reg256#9 +# asm 2: vmovupd 640(x2=%ymm8 +vmovupd 640(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 896 ] +# asm 1: vmovupd 896(x3=reg256#10 +# asm 2: vmovupd 896(x3=%ymm9 +vmovupd 896(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1152 ] +# asm 1: vmovupd 1152(x4=reg256#11 +# asm 2: vmovupd 1152(x4=%ymm10 +vmovupd 1152(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1408 ] +# asm 1: vmovupd 1408(x5=reg256#12 +# asm 2: vmovupd 1408(x5=%ymm11 +vmovupd 1408(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1664 ] +# asm 1: vmovupd 1664(x6=reg256#13 +# asm 2: vmovupd 1664(x6=%ymm12 +vmovupd 1664(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1920 ] +# asm 1: vmovupd 1920(x7=reg256#14 +# asm 2: vmovupd 1920(x7=%ymm13 +vmovupd 1920(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor 
x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 
2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 128 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 160(x0=%ymm6 +vmovupd 160(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 416 ] +# asm 1: vmovupd 416(x1=reg256#8 +# asm 2: vmovupd 416(x1=%ymm7 +vmovupd 416(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 672 ] +# asm 1: vmovupd 672(x2=reg256#9 +# asm 2: vmovupd 672(x2=%ymm8 +vmovupd 672(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 928 ] +# asm 1: vmovupd 928(x3=reg256#10 +# asm 2: vmovupd 928(x3=%ymm9 +vmovupd 928(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1184 ] +# asm 1: vmovupd 1184(x4=reg256#11 +# asm 2: vmovupd 1184(x4=%ymm10 +vmovupd 1184(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1440 ] +# asm 1: vmovupd 1440(x5=reg256#12 +# asm 2: vmovupd 1440(x5=%ymm11 +vmovupd 
1440(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1696 ] +# asm 1: vmovupd 1696(x6=reg256#13 +# asm 2: vmovupd 1696(x6=%ymm12 +vmovupd 1696(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1952 ] +# asm 1: vmovupd 1952(x7=reg256#14 +# asm 2: vmovupd 1952(x7=%ymm13 +vmovupd 1952(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: 
vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: 
vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 160 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 192(x0=%ymm6 +vmovupd 192(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 448 ] +# asm 1: vmovupd 448(x1=reg256#8 +# asm 2: vmovupd 448(x1=%ymm7 +vmovupd 448(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 704 ] +# asm 1: vmovupd 704(x2=reg256#9 +# asm 2: vmovupd 704(x2=%ymm8 +vmovupd 704(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 960 ] +# asm 1: vmovupd 960(x3=reg256#10 +# asm 2: vmovupd 960(x3=%ymm9 +vmovupd 960(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1216 ] +# asm 1: vmovupd 1216(x4=reg256#11 +# asm 2: vmovupd 1216(x4=%ymm10 +vmovupd 1216(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1472 ] +# asm 1: vmovupd 1472(x5=reg256#12 +# asm 2: vmovupd 1472(x5=%ymm11 +vmovupd 1472(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1728 ] +# asm 1: vmovupd 1728(x6=reg256#13 +# asm 2: vmovupd 1728(x6=%ymm12 +vmovupd 1728(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1984 ] +# asm 1: vmovupd 1984(x7=reg256#14 +# asm 2: vmovupd 1984(x7=%ymm13 +vmovupd 1984(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: 
vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: 
vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor 
%ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 192 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 224(x0=%ymm6 +vmovupd 224(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 480 ] +# asm 1: vmovupd 480(x1=reg256#8 +# asm 2: vmovupd 480(x1=%ymm7 +vmovupd 480(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 736 ] +# asm 1: vmovupd 736(x2=reg256#9 +# asm 2: vmovupd 736(x2=%ymm8 +vmovupd 736(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 992 ] +# asm 1: vmovupd 992(x3=reg256#10 +# asm 2: vmovupd 992(x3=%ymm9 +vmovupd 992(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1248 ] +# asm 1: vmovupd 1248(x4=reg256#11 +# asm 2: vmovupd 1248(x4=%ymm10 +vmovupd 1248(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1504 ] +# asm 1: vmovupd 1504(x5=reg256#12 +# asm 2: vmovupd 1504(x5=%ymm11 +vmovupd 1504(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1760 ] +# asm 1: vmovupd 1760(x6=reg256#13 +# asm 2: vmovupd 1760(x6=%ymm12 +vmovupd 1760(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 2016 ] +# asm 1: vmovupd 2016(x7=reg256#14 +# asm 2: vmovupd 2016(x7=%ymm13 +vmovupd 2016(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#1 +# asm 2: vpand v00=%ymm0 +vpand %ymm9,%ymm0,%ymm0 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#13 +# asm 2: vpsllq $32,v10=%ymm12 +vpsllq $32,%ymm13,%ymm12 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand 
v11=reg256#2 +# asm 2: vpand v11=%ymm1 +vpand %ymm13,%ymm1,%ymm1 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#1 +# asm 2: vpor x3=%ymm0 +vpor %ymm0,%ymm12,%ymm0 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#2 +# asm 2: vpor x7=%ymm1 +vpor %ymm9,%ymm1,%ymm1 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm14,%ymm2,%ymm9 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#13 +# asm 2: vpslld $16,v10=%ymm12 +vpslld $16,%ymm11,%ymm12 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#14 +# asm 2: vpsrld $16,v01=%ymm13 +vpsrld $16,%ymm14,%ymm13 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm12,%ymm9 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm13,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm10,%ymm2,%ymm12 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#14 +# asm 2: vpslld $16,v10=%ymm13 +vpslld $16,%ymm0,%ymm13 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#1 +# asm 2: vpand v11=%ymm0 +vpand %ymm0,%ymm3,%ymm0 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#13 +# asm 2: vpor x1=%ymm12 +vpor %ymm12,%ymm13,%ymm12 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#1 +# asm 2: vpor x3=%ymm0 +vpor %ymm10,%ymm0,%ymm0 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm6,%ymm2,%ymm10 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#14 +# asm 2: vpslld $16,v10=%ymm13 +vpslld $16,%ymm8,%ymm13 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#11 +# asm 2: vpor x4=%ymm10 +vpor %ymm10,%ymm13,%ymm10 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#3 +# asm 2: vpand v00=%ymm2 +vpand %ymm7,%ymm2,%ymm2 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#9 +# asm 2: vpslld $16,v10=%ymm8 +vpslld $16,%ymm1,%ymm8 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#2 +# asm 2: vpand v11=%ymm1 +vpand %ymm1,%ymm3,%ymm1 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#3 +# asm 2: vpor x5=%ymm2 +vpor %ymm2,%ymm8,%ymm2 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#2 +# asm 2: vpor x7=%ymm1 +vpor %ymm7,%ymm1,%ymm1 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#4 +# asm 2: vpand v00=%ymm3 +vpand %ymm9,%ymm4,%ymm3 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#8 +# asm 2: vpsllw $8,v10=%ymm7 +vpsllw $8,%ymm12,%ymm7 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#9 +# asm 2: vpsrlw $8,v01=%ymm8 +vpsrlw $8,%ymm9,%ymm8 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm12,%ymm5,%ymm9 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#4 +# asm 2: vpor x0=%ymm3 +vpor %ymm3,%ymm7,%ymm3 
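+
+# The x0/x1 pair is recombined here as x0 = (x0 & mask4) | (x1 << 8) and,
+# just below, x1 = (x0 >> 8) | (x1 & mask5): the same mask/shift/or step
+# already applied at 32-bit (vpsllq $32) and 16-bit (vpslld $16) width,
+# now at byte granularity.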
+ +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#8 +# asm 2: vpor x1=%ymm7 +vpor %ymm8,%ymm9,%ymm7 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm11,%ymm4,%ymm8 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#10 +# asm 2: vpsllw $8,v10=%ymm9 +vpsllw $8,%ymm0,%ymm9 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#1 +# asm 2: vpand v11=%ymm0 +vpand %ymm0,%ymm5,%ymm0 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#9 +# asm 2: vpor x2=%ymm8 +vpor %ymm8,%ymm9,%ymm8 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#1 +# asm 2: vpor x3=%ymm0 +vpor %ymm11,%ymm0,%ymm0 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm10,%ymm4,%ymm9 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#12 +# asm 2: vpsllw $8,v10=%ymm11 +vpsllw $8,%ymm2,%ymm11 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#11 +# asm 2: vpsrlw $8,v01=%ymm10 +vpsrlw $8,%ymm10,%ymm10 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#3 +# asm 2: vpand v11=%ymm2 +vpand %ymm2,%ymm5,%ymm2 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#10 +# asm 2: vpor x4=%ymm9 +vpor %ymm9,%ymm11,%ymm9 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#3 +# asm 2: vpor x5=%ymm2 +vpor %ymm10,%ymm2,%ymm2 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#5 +# asm 2: vpand v00=%ymm4 +vpand %ymm6,%ymm4,%ymm4 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#11 +# asm 2: vpsllw $8,v10=%ymm10 +vpsllw $8,%ymm1,%ymm10 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#2 +# asm 2: vpand v11=%ymm1 +vpand %ymm1,%ymm5,%ymm1 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#5 +# asm 2: vpor x6=%ymm4 +vpor %ymm4,%ymm10,%ymm4 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#2 +# asm 2: vpor x7=%ymm1 +vpor %ymm6,%ymm1,%ymm1 + +# qhasm: mem256[ input_0 + 224 ] = x0 +# asm 1: vmovupd mask0=reg256#1 +# asm 2: vmovapd PQCLEAN_MCELIECE6960119F_AVX_MASK2_0,>mask0=%ymm0 +vmovapd PQCLEAN_MCELIECE6960119F_AVX_MASK2_0(%rip),%ymm0 + +# qhasm: mask1 aligned= mem256[ PQCLEAN_MCELIECE6960119F_AVX_MASK2_1 ] +# asm 1: vmovapd PQCLEAN_MCELIECE6960119F_AVX_MASK2_1,>mask1=reg256#2 +# asm 2: vmovapd PQCLEAN_MCELIECE6960119F_AVX_MASK2_1,>mask1=%ymm1 +vmovapd PQCLEAN_MCELIECE6960119F_AVX_MASK2_1(%rip),%ymm1 + +# qhasm: mask2 aligned= mem256[ PQCLEAN_MCELIECE6960119F_AVX_MASK1_0 ] +# asm 1: vmovapd PQCLEAN_MCELIECE6960119F_AVX_MASK1_0,>mask2=reg256#3 +# asm 2: vmovapd PQCLEAN_MCELIECE6960119F_AVX_MASK1_0,>mask2=%ymm2 +vmovapd PQCLEAN_MCELIECE6960119F_AVX_MASK1_0(%rip),%ymm2 + +# qhasm: mask3 aligned= mem256[ PQCLEAN_MCELIECE6960119F_AVX_MASK1_1 ] +# asm 1: vmovapd PQCLEAN_MCELIECE6960119F_AVX_MASK1_1,>mask3=reg256#4 +# asm 2: vmovapd PQCLEAN_MCELIECE6960119F_AVX_MASK1_1,>mask3=%ymm3 +vmovapd PQCLEAN_MCELIECE6960119F_AVX_MASK1_1(%rip),%ymm3 + +# qhasm: mask4 aligned= mem256[ PQCLEAN_MCELIECE6960119F_AVX_MASK0_0 ] +# asm 1: vmovapd PQCLEAN_MCELIECE6960119F_AVX_MASK0_0,>mask4=reg256#5 +# asm 2: vmovapd PQCLEAN_MCELIECE6960119F_AVX_MASK0_0,>mask4=%ymm4 +vmovapd PQCLEAN_MCELIECE6960119F_AVX_MASK0_0(%rip),%ymm4 + +# qhasm: mask5 aligned= mem256[ PQCLEAN_MCELIECE6960119F_AVX_MASK0_1 ] +# asm 1: vmovapd PQCLEAN_MCELIECE6960119F_AVX_MASK0_1,>mask5=reg256#6 +# asm 2: 
vmovapd PQCLEAN_MCELIECE6960119F_AVX_MASK0_1,>mask5=%ymm5 +vmovapd PQCLEAN_MCELIECE6960119F_AVX_MASK0_1(%rip),%ymm5 + +# qhasm: x0 = mem256[ input_0 + 0 ] +# asm 1: vmovupd 0(x0=reg256#7 +# asm 2: vmovupd 0(x0=%ymm6 +vmovupd 0(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 32 ] +# asm 1: vmovupd 32(x1=reg256#8 +# asm 2: vmovupd 32(x1=%ymm7 +vmovupd 32(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 64 ] +# asm 1: vmovupd 64(x2=reg256#9 +# asm 2: vmovupd 64(x2=%ymm8 +vmovupd 64(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 96 ] +# asm 1: vmovupd 96(x3=reg256#10 +# asm 2: vmovupd 96(x3=%ymm9 +vmovupd 96(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 128 ] +# asm 1: vmovupd 128(x4=reg256#11 +# asm 2: vmovupd 128(x4=%ymm10 +vmovupd 128(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 160 ] +# asm 1: vmovupd 160(x5=reg256#12 +# asm 2: vmovupd 160(x5=%ymm11 +vmovupd 160(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 192 ] +# asm 1: vmovupd 192(x6=reg256#13 +# asm 2: vmovupd 192(x6=%ymm12 +vmovupd 192(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 224 ] +# asm 1: vmovupd 224(x7=reg256#14 +# asm 2: vmovupd 224(x7=%ymm13 +vmovupd 224(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 
+# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor 
x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 0 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 256(x0=%ymm6 +vmovupd 256(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 288 ] +# asm 1: vmovupd 288(x1=reg256#8 +# asm 2: vmovupd 288(x1=%ymm7 +vmovupd 288(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 320 ] +# asm 1: vmovupd 320(x2=reg256#9 +# asm 2: vmovupd 320(x2=%ymm8 +vmovupd 320(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 352 ] +# asm 1: vmovupd 352(x3=reg256#10 +# asm 2: vmovupd 352(x3=%ymm9 +vmovupd 352(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 384 ] +# asm 1: vmovupd 384(x4=reg256#11 +# asm 2: vmovupd 384(x4=%ymm10 +vmovupd 384(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 416 ] +# asm 1: vmovupd 416(x5=reg256#12 +# asm 2: vmovupd 416(x5=%ymm11 +vmovupd 416(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 448 ] +# asm 1: vmovupd 448(x6=reg256#13 +# asm 2: vmovupd 448(x6=%ymm12 +vmovupd 448(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 480 ] +# asm 1: vmovupd 480(x7=reg256#14 +# asm 2: vmovupd 480(x7=%ymm13 +vmovupd 480(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand 
%ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# 
asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 
+# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 256 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 512(x0=%ymm6 +vmovupd 512(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 544 ] +# asm 1: vmovupd 544(x1=reg256#8 +# asm 2: vmovupd 544(x1=%ymm7 +vmovupd 544(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 576 ] +# asm 1: vmovupd 576(x2=reg256#9 +# asm 2: vmovupd 576(x2=%ymm8 +vmovupd 576(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 608 ] +# asm 1: vmovupd 608(x3=reg256#10 +# asm 2: vmovupd 608(x3=%ymm9 +vmovupd 608(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 640 ] +# asm 1: vmovupd 640(x4=reg256#11 +# asm 2: vmovupd 640(x4=%ymm10 +vmovupd 640(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 672 ] +# asm 1: vmovupd 672(x5=reg256#12 +# asm 2: vmovupd 672(x5=%ymm11 +vmovupd 672(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 704 ] +# asm 1: vmovupd 704(x6=reg256#13 +# asm 2: vmovupd 704(x6=%ymm12 +vmovupd 704(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 736 ] +# asm 1: vmovupd 736(x7=reg256#14 +# asm 2: vmovupd 736(x7=%ymm13 +vmovupd 736(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + 
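+
+# Last pair of the 4-bit round: below, x3 and x7 are interleaved via
+# v00 = x3 & mask0, v10 = (x7 & mask0) << 4, v01 = (x3 & mask1) >> 4,
+# v11 = x7 & mask1, then x3 = v00 | v10 and x7 = v01 | v11, matching the
+# x0/x4, x1/x5 and x2/x6 pairs handled above.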
+# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand 
%ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 512 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 768(x0=%ymm6 +vmovupd 768(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 800 ] +# asm 1: vmovupd 800(x1=reg256#8 +# asm 2: vmovupd 800(x1=%ymm7 +vmovupd 800(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 832 ] +# asm 1: vmovupd 832(x2=reg256#9 +# asm 2: vmovupd 832(x2=%ymm8 +vmovupd 832(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 864 ] +# asm 1: vmovupd 864(x3=reg256#10 +# asm 2: vmovupd 864(x3=%ymm9 +vmovupd 864(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 896 ] +# asm 1: vmovupd 896(x4=reg256#11 +# asm 2: vmovupd 896(x4=%ymm10 +vmovupd 896(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 928 ] +# asm 1: vmovupd 928(x5=reg256#12 +# asm 2: vmovupd 928(x5=%ymm11 +vmovupd 928(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 960 ] +# asm 1: vmovupd 960(x6=reg256#13 +# asm 2: vmovupd 960(x6=%ymm12 +vmovupd 960(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 992 ] +# asm 1: vmovupd 992(x7=reg256#14 +# asm 2: vmovupd 992(x7=%ymm13 +vmovupd 992(%rdi),%ymm13 + +# qhasm: v00 = x0 & 
mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 
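+
+# 2-bit round for the x1/x3 pair: below, v10 (x3 & mask2) is shifted left by
+# 2 and or'd with v00 (x1 & mask2) to form the new x1, while v01 (x1 & mask3)
+# is shifted right by 2 and or'd with v11 (x3 & mask3) to form the new x3;
+# a final 1-bit round with mask4/mask5 (vpsllq $1 / vpsrlq $1) follows.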
+ +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand 
%ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 768 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 1024(x0=%ymm6 +vmovupd 1024(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 1056 ] +# asm 1: vmovupd 1056(x1=reg256#8 +# asm 2: vmovupd 1056(x1=%ymm7 +vmovupd 1056(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 1088 ] +# asm 1: vmovupd 1088(x2=reg256#9 +# asm 2: vmovupd 1088(x2=%ymm8 +vmovupd 1088(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 1120 ] +# asm 1: vmovupd 1120(x3=reg256#10 +# asm 2: vmovupd 1120(x3=%ymm9 +vmovupd 1120(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1152 ] +# asm 1: vmovupd 1152(x4=reg256#11 +# asm 2: vmovupd 1152(x4=%ymm10 +vmovupd 1152(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1184 ] +# asm 1: vmovupd 1184(x5=reg256#12 +# asm 2: vmovupd 1184(x5=%ymm11 +vmovupd 1184(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1216 ] +# asm 1: vmovupd 1216(x6=reg256#13 +# asm 2: vmovupd 1216(x6=%ymm12 +vmovupd 1216(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1248 ] +# asm 1: vmovupd 1248(x7=reg256#14 +# asm 2: vmovupd 1248(x7=%ymm13 +vmovupd 1248(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# 
qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand 
%ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 1024 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 1280(x0=%ymm6 +vmovupd 1280(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 1312 ] +# asm 1: vmovupd 1312(x1=reg256#8 +# asm 2: vmovupd 1312(x1=%ymm7 +vmovupd 1312(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 1344 ] +# asm 1: vmovupd 1344(x2=reg256#9 +# asm 2: vmovupd 1344(x2=%ymm8 +vmovupd 1344(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 1376 ] +# asm 1: vmovupd 1376(x3=reg256#10 +# asm 2: vmovupd 1376(x3=%ymm9 +vmovupd 1376(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1408 ] +# asm 1: vmovupd 1408(x4=reg256#11 +# asm 2: vmovupd 
1408(x4=%ymm10 +vmovupd 1408(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1440 ] +# asm 1: vmovupd 1440(x5=reg256#12 +# asm 2: vmovupd 1440(x5=%ymm11 +vmovupd 1440(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1472 ] +# asm 1: vmovupd 1472(x6=reg256#13 +# asm 2: vmovupd 1472(x6=%ymm12 +vmovupd 1472(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1504 ] +# asm 1: vmovupd 1504(x7=reg256#14 +# asm 2: vmovupd 1504(x7=%ymm13 +vmovupd 1504(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 
+vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: 
vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 1280 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 1536(x0=%ymm6 +vmovupd 1536(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 1568 ] +# asm 1: vmovupd 1568(x1=reg256#8 +# asm 2: vmovupd 1568(x1=%ymm7 +vmovupd 1568(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 1600 ] +# asm 1: vmovupd 1600(x2=reg256#9 +# asm 2: vmovupd 1600(x2=%ymm8 +vmovupd 1600(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 1632 ] +# asm 1: vmovupd 1632(x3=reg256#10 +# asm 2: vmovupd 1632(x3=%ymm9 +vmovupd 1632(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1664 ] +# asm 1: vmovupd 1664(x4=reg256#11 +# asm 2: vmovupd 1664(x4=%ymm10 +vmovupd 1664(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1696 ] +# asm 1: vmovupd 1696(x5=reg256#12 +# asm 2: vmovupd 1696(x5=%ymm11 +vmovupd 1696(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1728 ] +# asm 1: vmovupd 1728(x6=reg256#13 +# asm 2: vmovupd 1728(x6=%ymm12 +vmovupd 1728(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1760 ] +# asm 1: vmovupd 1760(x7=reg256#14 +# asm 2: vmovupd 1760(x7=%ymm13 +vmovupd 1760(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand 
%ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 
2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 1536 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 1792(x0=%ymm6 +vmovupd 1792(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 1824 ] +# asm 
1: vmovupd 1824(x1=reg256#8 +# asm 2: vmovupd 1824(x1=%ymm7 +vmovupd 1824(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 1856 ] +# asm 1: vmovupd 1856(x2=reg256#9 +# asm 2: vmovupd 1856(x2=%ymm8 +vmovupd 1856(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 1888 ] +# asm 1: vmovupd 1888(x3=reg256#10 +# asm 2: vmovupd 1888(x3=%ymm9 +vmovupd 1888(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1920 ] +# asm 1: vmovupd 1920(x4=reg256#11 +# asm 2: vmovupd 1920(x4=%ymm10 +vmovupd 1920(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1952 ] +# asm 1: vmovupd 1952(x5=reg256#12 +# asm 2: vmovupd 1952(x5=%ymm11 +vmovupd 1952(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1984 ] +# asm 1: vmovupd 1984(x6=reg256#13 +# asm 2: vmovupd 1984(x6=%ymm12 +vmovupd 1984(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 2016 ] +# asm 1: vmovupd 2016(x7=reg256#14 +# asm 2: vmovupd 2016(x7=%ymm13 +vmovupd 2016(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#1 +# asm 2: vpand v10=%ymm0 +vpand %ymm13,%ymm0,%ymm0 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#2 +# asm 2: vpand v11=%ymm1 +vpand %ymm13,%ymm1,%ymm1 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#1 +# asm 2: vpor x3=%ymm0 +vpor %ymm12,%ymm0,%ymm0 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#2 +# asm 2: vpor x7=%ymm1 +vpor 
%ymm9,%ymm1,%ymm1 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm14,%ymm2,%ymm9 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#13 +# asm 2: vpand v10=%ymm12 +vpand %ymm11,%ymm2,%ymm12 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm14,%ymm3,%ymm13 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm12,%ymm9 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm13,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm10,%ymm2,%ymm12 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#14 +# asm 2: vpand v10=%ymm13 +vpand %ymm0,%ymm2,%ymm13 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#1 +# asm 2: vpand v11=%ymm0 +vpand %ymm0,%ymm3,%ymm0 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#13 +# asm 2: vpor x1=%ymm12 +vpor %ymm12,%ymm13,%ymm12 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#1 +# asm 2: vpor x3=%ymm0 +vpor %ymm10,%ymm0,%ymm0 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm6,%ymm2,%ymm10 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#14 +# asm 2: vpand v10=%ymm13 +vpand %ymm8,%ymm2,%ymm13 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#11 +# asm 2: vpor x4=%ymm10 +vpor %ymm10,%ymm13,%ymm10 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#3 +# asm 2: vpand v10=%ymm2 +vpand %ymm1,%ymm2,%ymm2 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#2 +# asm 2: vpand v11=%ymm1 +vpand %ymm1,%ymm3,%ymm1 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#3 +# asm 2: vpor x5=%ymm2 +vpor %ymm8,%ymm2,%ymm2 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#2 +# asm 2: vpor x7=%ymm1 +vpor %ymm7,%ymm1,%ymm1 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#4 +# asm 2: vpand v00=%ymm3 +vpand %ymm9,%ymm4,%ymm3 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#8 +# asm 2: vpand v10=%ymm7 +vpand %ymm12,%ymm4,%ymm7 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm9,%ymm5,%ymm8 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm12,%ymm5,%ymm9 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#4 +# asm 2: vpor x0=%ymm3 +vpor %ymm3,%ymm7,%ymm3 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#8 +# asm 2: vpor x1=%ymm7 +vpor %ymm8,%ymm9,%ymm7 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm11,%ymm4,%ymm8 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#10 +# asm 2: vpand v10=%ymm9 +vpand %ymm0,%ymm4,%ymm9 + +# 
qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#1 +# asm 2: vpand v11=%ymm0 +vpand %ymm0,%ymm5,%ymm0 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#9 +# asm 2: vpor x2=%ymm8 +vpor %ymm8,%ymm9,%ymm8 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#1 +# asm 2: vpor x3=%ymm0 +vpor %ymm11,%ymm0,%ymm0 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm10,%ymm4,%ymm9 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#12 +# asm 2: vpand v10=%ymm11 +vpand %ymm2,%ymm4,%ymm11 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#3 +# asm 2: vpand v11=%ymm2 +vpand %ymm2,%ymm5,%ymm2 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#10 +# asm 2: vpor x4=%ymm9 +vpor %ymm9,%ymm11,%ymm9 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#3 +# asm 2: vpor x5=%ymm2 +vpor %ymm10,%ymm2,%ymm2 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm6,%ymm4,%ymm10 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#5 +# asm 2: vpand v10=%ymm4 +vpand %ymm1,%ymm4,%ymm4 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#2 +# asm 2: vpand v11=%ymm1 +vpand %ymm1,%ymm5,%ymm1 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#5 +# asm 2: vpor x6=%ymm4 +vpor %ymm10,%ymm4,%ymm4 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#2 +# asm 2: vpor x7=%ymm1 +vpor %ymm6,%ymm1,%ymm1 + +# qhasm: mem256[ input_0 + 1792 ] = x0 +# asm 1: vmovupd +#include + +void PQCLEAN_MCELIECE6960119F_AVX_uint32_sort(uint32_t *x, size_t n); + +#endif diff --git a/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/update_asm.S b/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/update_asm.S new file mode 100644 index 0000000000..e57ae7fb62 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/update_asm.S @@ -0,0 +1,576 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: int64 s0 + +# qhasm: int64 s1 + +# qhasm: int64 s2 + +# qhasm: enter update_asm +.p2align 5 +.global _PQCLEAN_MCELIECE6960119F_AVX_update_asm +.global PQCLEAN_MCELIECE6960119F_AVX_update_asm +_PQCLEAN_MCELIECE6960119F_AVX_update_asm: +PQCLEAN_MCELIECE6960119F_AVX_update_asm: +mov %rsp,%r11 +and $31,%r11 +add $0,%r11 +sub %r11,%rsp + +# qhasm: s2 = input_1 +# asm 1: mov s2=int64#2 +# asm 2: mov s2=%rsi +mov %rsi,%rsi + +# qhasm: s0 = mem64[ input_0 + 0 ] +# asm 1: movq 0(s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) 
>> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1, + +void PQCLEAN_MCELIECE6960119F_AVX_store_i(unsigned char *out, uint64_t in, int i) { + int j; + + for (j = 0; j < i; j++) { + out[j] = (in >> (j * 8)) & 0xFF; + } +} + +void PQCLEAN_MCELIECE6960119F_AVX_store2(unsigned char *dest, uint16_t a) { + dest[0] = a & 0xFF; + dest[1] = a >> 8; +} + +uint16_t PQCLEAN_MCELIECE6960119F_AVX_load2(const unsigned char *src) { + uint16_t a; + + a = src[1]; + a <<= 8; + a |= src[0]; + + return a & GFMASK; +} + +uint32_t PQCLEAN_MCELIECE6960119F_AVX_load4(const unsigned char *src) { + uint32_t a; + + a = src[3]; + a <<= 8; + a |= src[2]; + a <<= 8; + a |= src[1]; + a <<= 8; + a |= src[0]; + + return a; +} + +void 
PQCLEAN_MCELIECE6960119F_AVX_irr_load(vec128 *out, const unsigned char *in) { + int i, j; + uint64_t v[2]; + uint16_t irr[ SYS_T + 1 ]; + + for (i = 0; i < SYS_T; i++) { + irr[i] = PQCLEAN_MCELIECE6960119F_AVX_load2(in + i * 2); + irr[i] &= GFMASK; + } + + irr[ SYS_T ] = 1; + + for (i = 0; i < GFBITS; i++) { + v[0] = v[1] = 0; + + for (j = 63; j >= 0; j--) { + v[0] <<= 1; + v[0] |= (irr[j] >> i) & 1; + } + for (j = SYS_T; j >= 64; j--) { + v[1] <<= 1; + v[1] |= (irr[j] >> i) & 1; + } + + out[i] = PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(v[0], v[1]); + } +} + +void PQCLEAN_MCELIECE6960119F_AVX_store8(unsigned char *out, uint64_t in) { + out[0] = (in >> 0x00) & 0xFF; + out[1] = (in >> 0x08) & 0xFF; + out[2] = (in >> 0x10) & 0xFF; + out[3] = (in >> 0x18) & 0xFF; + out[4] = (in >> 0x20) & 0xFF; + out[5] = (in >> 0x28) & 0xFF; + out[6] = (in >> 0x30) & 0xFF; + out[7] = (in >> 0x38) & 0xFF; +} + +uint64_t PQCLEAN_MCELIECE6960119F_AVX_load8(const unsigned char *in) { + int i; + uint64_t ret = in[7]; + + for (i = 6; i >= 0; i--) { + ret <<= 8; + ret |= in[i]; + } + + return ret; +} + +vec128 PQCLEAN_MCELIECE6960119F_AVX_load16(const unsigned char *in) { + return PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x( PQCLEAN_MCELIECE6960119F_AVX_load8(in), PQCLEAN_MCELIECE6960119F_AVX_load8(in + 8) ); +} + +void PQCLEAN_MCELIECE6960119F_AVX_store16(unsigned char *out, vec128 in) { + PQCLEAN_MCELIECE6960119F_AVX_store8(out + 0, PQCLEAN_MCELIECE6960119F_AVX_vec128_extract(in, 0)); + PQCLEAN_MCELIECE6960119F_AVX_store8(out + 8, PQCLEAN_MCELIECE6960119F_AVX_vec128_extract(in, 1)); +} diff --git a/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/util.h b/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/util.h new file mode 100644 index 0000000000..c1001a6297 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/util.h @@ -0,0 +1,23 @@ +#ifndef PQCLEAN_MCELIECE6960119F_AVX_UTIL_H +#define PQCLEAN_MCELIECE6960119F_AVX_UTIL_H +/* + This file is for loading/storing data in a little-endian fashion +*/ + + +#include "vec128.h" + +#include + +void PQCLEAN_MCELIECE6960119F_AVX_store_i(unsigned char *out, uint64_t in, int i); +void PQCLEAN_MCELIECE6960119F_AVX_store2(unsigned char *dest, uint16_t a); +uint16_t PQCLEAN_MCELIECE6960119F_AVX_load2(const unsigned char *src); +uint32_t PQCLEAN_MCELIECE6960119F_AVX_load4(const unsigned char *src); +void PQCLEAN_MCELIECE6960119F_AVX_irr_load(vec128 *out, const unsigned char *in); +void PQCLEAN_MCELIECE6960119F_AVX_store8(unsigned char *out, uint64_t in); +uint64_t PQCLEAN_MCELIECE6960119F_AVX_load8(const unsigned char *in); +vec128 PQCLEAN_MCELIECE6960119F_AVX_load16(const unsigned char *in); + +void PQCLEAN_MCELIECE6960119F_AVX_store16(unsigned char *out, vec128 in); + +#endif diff --git a/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/vec128.c b/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/vec128.c new file mode 100644 index 0000000000..d2c9cdf2fb --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/vec128.c @@ -0,0 +1,79 @@ +#include "vec128.h" + +vec128 PQCLEAN_MCELIECE6960119F_AVX_vec128_set1_16b(uint16_t a) { + return _mm_set1_epi16(a); +} + +vec128 PQCLEAN_MCELIECE6960119F_AVX_vec128_setzero(void) { + return _mm_setzero_si128(); +} + +vec128 PQCLEAN_MCELIECE6960119F_AVX_vec128_and(vec128 a, vec128 b) { + return _mm_and_si128(a, b); +} + +vec128 PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(vec128 a, vec128 b) { + return _mm_xor_si128(a, b); +} + +vec128 PQCLEAN_MCELIECE6960119F_AVX_vec128_or(vec128 a, vec128 b) { 
+ return _mm_or_si128(a, b); +} + +vec128 PQCLEAN_MCELIECE6960119F_AVX_vec128_sll_2x(vec128 a, int s) { + return _mm_slli_epi64(a, s); +} + +vec128 PQCLEAN_MCELIECE6960119F_AVX_vec128_srl_2x(vec128 a, int s) { + return _mm_srli_epi64(a, s); +} + +vec128 PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(uint64_t a0, uint64_t a1) { + return _mm_set_epi64x(a1, a0); +} + +vec128 PQCLEAN_MCELIECE6960119F_AVX_vec128_unpack_low(vec128 a, vec128 b) { + return _mm_unpacklo_epi64(a, b); +} + +vec128 PQCLEAN_MCELIECE6960119F_AVX_vec128_unpack_high(vec128 a, vec128 b) { + return _mm_unpackhi_epi64(a, b); +} + +vec128 PQCLEAN_MCELIECE6960119F_AVX_vec128_setbits(uint64_t a) { + return _mm_set1_epi64x(-a); +} + +void PQCLEAN_MCELIECE6960119F_AVX_vec128_copy(vec128 *dest, const vec128 *src) { + int i; + + for (i = 0; i < GFBITS; i++) { + dest[i] = src[i]; + } +} + +void PQCLEAN_MCELIECE6960119F_AVX_vec128_add(vec128 *c, const vec128 *a, const vec128 *b) { + int i; + + for (i = 0; i < GFBITS; i++) { + c[i] = PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(a[i], b[i]); + } +} + +vec128 PQCLEAN_MCELIECE6960119F_AVX_vec128_or_reduce(const vec128 *a) { + int i; + vec128 ret; + + ret = a[0]; + for (i = 1; i < GFBITS; i++) { + ret = PQCLEAN_MCELIECE6960119F_AVX_vec128_or(ret, a[i]); + } + + return ret; +} + +/* bitsliced field multiplications */ +void PQCLEAN_MCELIECE6960119F_AVX_vec128_mul(vec128 *h, vec128 *f, const vec128 *g) { + PQCLEAN_MCELIECE6960119F_AVX_vec128_mul_asm(h, f, g, 16); +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/vec128.h b/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/vec128.h new file mode 100644 index 0000000000..765be1796a --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/vec128.h @@ -0,0 +1,41 @@ +#ifndef PQCLEAN_MCELIECE6960119F_AVX_VEC128_H +#define PQCLEAN_MCELIECE6960119F_AVX_VEC128_H +/* + This file is for functions related to 128-bit vectors + including functions for bitsliced field operations +*/ + + +#include "params.h" + +#include +#include + +typedef __m128i vec128; + +// this needs to be a macro, because +// _mm_extract_epi64 requires a literal int argument. 
+#define PQCLEAN_MCELIECE6960119F_AVX_vec128_extract(a, i) ((uint64_t) _mm_extract_epi64((vec128) (a), (i))) + +int PQCLEAN_MCELIECE6960119F_AVX_vec128_testz(vec128 a); +vec128 PQCLEAN_MCELIECE6960119F_AVX_vec128_set1_16b(uint16_t a); +vec128 PQCLEAN_MCELIECE6960119F_AVX_vec128_setzero(void); +vec128 PQCLEAN_MCELIECE6960119F_AVX_vec128_and(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE6960119F_AVX_vec128_xor(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE6960119F_AVX_vec128_or(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE6960119F_AVX_vec128_sll_2x(vec128 a, int s); +vec128 PQCLEAN_MCELIECE6960119F_AVX_vec128_srl_2x(vec128 a, int s); +vec128 PQCLEAN_MCELIECE6960119F_AVX_vec128_set2x(uint64_t a0, uint64_t a1); +vec128 PQCLEAN_MCELIECE6960119F_AVX_vec128_unpack_low(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE6960119F_AVX_vec128_unpack_high(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE6960119F_AVX_vec128_setbits(uint64_t a); +void PQCLEAN_MCELIECE6960119F_AVX_vec128_copy(vec128 *dest, const vec128 *src); +void PQCLEAN_MCELIECE6960119F_AVX_vec128_add(vec128 *c, const vec128 *a, const vec128 *b); +vec128 PQCLEAN_MCELIECE6960119F_AVX_vec128_or_reduce(const vec128 *a); + +extern void PQCLEAN_MCELIECE6960119F_AVX_vec128_mul_asm(vec128 *, vec128 *, const vec128 *, int); + +/* bitsliced field multiplications */ +void PQCLEAN_MCELIECE6960119F_AVX_vec128_mul(vec128 *h, vec128 *f, const vec128 *g); + +#endif diff --git a/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/vec128_mul_asm.S b/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/vec128_mul_asm.S new file mode 100644 index 0000000000..8f8a0cb848 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/vec128_mul_asm.S @@ -0,0 +1,1816 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: reg256 b0 + +# qhasm: reg256 b1 + +# qhasm: reg256 b2 + +# qhasm: reg256 b3 + +# qhasm: reg256 b4 + +# qhasm: reg256 b5 + +# qhasm: reg256 b6 + +# qhasm: reg256 b7 + +# qhasm: reg256 b8 + +# qhasm: reg256 b9 + +# qhasm: reg256 b10 + +# qhasm: reg256 b11 + +# qhasm: reg256 b12 + +# qhasm: reg256 a0 + +# qhasm: reg256 a1 + +# qhasm: reg256 a2 + +# qhasm: reg256 a3 + +# qhasm: reg256 a4 + +# qhasm: reg256 a5 + +# qhasm: reg256 a6 + +# qhasm: reg256 r0 + +# qhasm: reg256 r1 + +# qhasm: reg256 r2 + +# qhasm: reg256 r3 + +# qhasm: reg256 r4 + +# qhasm: reg256 r5 + +# qhasm: reg256 r6 + +# qhasm: reg256 r7 + +# qhasm: reg256 r8 + +# qhasm: reg256 r9 + +# qhasm: reg256 r10 + +# qhasm: reg256 r11 + +# qhasm: reg256 r12 + +# qhasm: reg256 r13 + +# qhasm: reg256 r14 + +# qhasm: reg256 r15 + +# qhasm: reg256 r16 + +# qhasm: reg256 r17 + +# qhasm: reg256 r18 + +# qhasm: reg256 r19 + +# qhasm: reg256 r20 + +# qhasm: reg256 r21 + +# qhasm: reg256 r22 + +# qhasm: reg256 r23 + +# qhasm: reg256 r24 + +# qhasm: reg256 r + +# qhasm: reg128 h0 + +# qhasm: reg128 h1 + +# qhasm: reg128 h2 + +# qhasm: reg128 h3 + +# qhasm: reg128 h4 + +# qhasm: reg128 h5 + +# qhasm: reg128 h6 + +# qhasm: reg128 h7 + +# qhasm: reg128 h8 + +# qhasm: reg128 h9 + +# qhasm: reg128 h10 + +# qhasm: reg128 h11 + +# qhasm: reg128 h12 + +# qhasm: reg128 h13 + +# qhasm: reg128 h14 + +# qhasm: reg128 h15 + +# qhasm: reg128 
h16 + +# qhasm: reg128 h17 + +# qhasm: reg128 h18 + +# qhasm: reg128 h19 + +# qhasm: reg128 h20 + +# qhasm: reg128 h21 + +# qhasm: reg128 h22 + +# qhasm: reg128 h23 + +# qhasm: reg128 h24 + +# qhasm: stack4864 buf + +# qhasm: int64 ptr + +# qhasm: int64 tmp + +# qhasm: enter vec128_mul_asm +.p2align 5 +.global _PQCLEAN_MCELIECE6960119F_AVX_vec128_mul_asm +.global PQCLEAN_MCELIECE6960119F_AVX_vec128_mul_asm +_PQCLEAN_MCELIECE6960119F_AVX_vec128_mul_asm: +PQCLEAN_MCELIECE6960119F_AVX_vec128_mul_asm: +mov %rsp,%r11 +and $31,%r11 +add $608,%r11 +sub %r11,%rsp + +# qhasm: ptr = &buf +# asm 1: leaq ptr=int64#5 +# asm 2: leaq ptr=%r8 +leaq 0(%rsp),%r8 + +# qhasm: tmp = input_3 +# asm 1: mov tmp=int64#6 +# asm 2: mov tmp=%r9 +mov %rcx,%r9 + +# qhasm: tmp *= 12 +# asm 1: imulq $12,tmp=int64#6 +# asm 2: imulq $12,tmp=%r9 +imulq $12,%r9,%r9 + +# qhasm: input_2 += tmp +# asm 1: add b12=reg256#1 +# asm 2: vbroadcasti128 0(b12=%ymm0 +vbroadcasti128 0(%rdx), %ymm0 + +# qhasm: input_2 -= input_3 +# asm 1: sub a6=reg256#2 +# asm 2: vpxor a6=%ymm1 +vpxor %ymm1,%ymm1,%ymm1 + +# qhasm: a6[0] = mem128[ input_1 + 96 ] +# asm 1: vinsertf128 $0x0,96(r18=reg256#3 +# asm 2: vpand r18=%ymm2 +vpand %ymm0,%ymm1,%ymm2 + +# qhasm: mem256[ ptr + 576 ] = r18 +# asm 1: vmovupd r17=reg256#4 +# asm 2: vpand r17=%ymm3 +vpand %ymm0,%ymm2,%ymm3 + +# qhasm: a4[0] = mem128[ input_1 + 64 ] +# asm 1: vinsertf128 $0x0,64(r16=reg256#6 +# asm 2: vpand r16=%ymm5 +vpand %ymm0,%ymm4,%ymm5 + +# qhasm: a3[0] = mem128[ input_1 + 48 ] +# asm 1: vinsertf128 $0x0,48(r15=reg256#8 +# asm 2: vpand r15=%ymm7 +vpand %ymm0,%ymm6,%ymm7 + +# qhasm: a2[0] = mem128[ input_1 + 32 ] +# asm 1: vinsertf128 $0x0,32(r14=reg256#10 +# asm 2: vpand r14=%ymm9 +vpand %ymm0,%ymm8,%ymm9 + +# qhasm: a1[0] = mem128[ input_1 + 16 ] +# asm 1: vinsertf128 $0x0,16(r13=reg256#12 +# asm 2: vpand r13=%ymm11 +vpand %ymm0,%ymm10,%ymm11 + +# qhasm: a0[0] = mem128[ input_1 + 0 ] +# asm 1: vinsertf128 $0x0,0(r12=reg256#1 +# asm 2: vpand r12=%ymm0 +vpand %ymm0,%ymm12,%ymm0 + +# qhasm: b11 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b11=reg256#14 +# asm 2: vbroadcasti128 0(b11=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm2,%ymm3 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm4,%ymm3 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm6,%ymm3 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm8,%ymm3 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm10,%ymm3 + +# qhasm: r12 ^= r +# asm 1: vpxor r11=reg256#4 +# asm 2: vpand r11=%ymm3 +vpand %ymm13,%ymm12,%ymm3 + +# qhasm: b10 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b10=reg256#14 +# asm 2: vbroadcasti128 0(b10=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm2,%ymm5 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm4,%ymm5 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm6,%ymm5 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm8,%ymm5 + +# qhasm: r12 ^= r +# 
asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm10,%ymm5 + +# qhasm: r11 ^= r +# asm 1: vpxor r10=reg256#6 +# asm 2: vpand r10=%ymm5 +vpand %ymm13,%ymm12,%ymm5 + +# qhasm: b9 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b9=reg256#14 +# asm 2: vbroadcasti128 0(b9=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm2,%ymm7 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm4,%ymm7 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm6,%ymm7 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm8,%ymm7 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm10,%ymm7 + +# qhasm: r10 ^= r +# asm 1: vpxor r9=reg256#8 +# asm 2: vpand r9=%ymm7 +vpand %ymm13,%ymm12,%ymm7 + +# qhasm: b8 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b8=reg256#14 +# asm 2: vbroadcasti128 0(b8=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm2,%ymm9 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm6,%ymm9 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm8,%ymm9 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm10,%ymm9 + +# qhasm: r9 ^= r +# asm 1: vpxor r8=reg256#10 +# asm 2: vpand r8=%ymm9 +vpand %ymm13,%ymm12,%ymm9 + +# qhasm: b7 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b7=reg256#14 +# asm 2: vbroadcasti128 0(b7=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm2,%ymm11 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm4,%ymm11 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm6,%ymm11 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm8,%ymm11 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm10,%ymm11 + +# qhasm: r8 ^= r +# asm 1: vpxor r7=reg256#12 +# asm 2: vpand r7=%ymm11 +vpand %ymm13,%ymm12,%ymm11 + +# qhasm: b6 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b6=reg256#14 +# asm 2: vbroadcasti128 0(b6=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm2,%ymm0 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm4,%ymm0 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm6,%ymm0 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm8,%ymm0 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm10,%ymm0 + +# qhasm: r7 ^= r +# asm 1: vpxor r6=reg256#1 +# 
asm 2: vpand r6=%ymm0 +vpand %ymm13,%ymm12,%ymm0 + +# qhasm: b5 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b5=reg256#14 +# asm 2: vbroadcasti128 0(b5=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm2,%ymm3 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm4,%ymm3 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm6,%ymm3 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm8,%ymm3 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm10,%ymm3 + +# qhasm: r6 ^= r +# asm 1: vpxor r5=reg256#4 +# asm 2: vpand r5=%ymm3 +vpand %ymm13,%ymm12,%ymm3 + +# qhasm: b4 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b4=reg256#14 +# asm 2: vbroadcasti128 0(b4=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm2,%ymm5 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm4,%ymm5 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm6,%ymm5 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm8,%ymm5 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm10,%ymm5 + +# qhasm: r5 ^= r +# asm 1: vpxor r4=reg256#6 +# asm 2: vpand r4=%ymm5 +vpand %ymm13,%ymm12,%ymm5 + +# qhasm: b3 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b3=reg256#14 +# asm 2: vbroadcasti128 0(b3=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm2,%ymm7 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm4,%ymm7 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm6,%ymm7 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm8,%ymm7 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm10,%ymm7 + +# qhasm: r4 ^= r +# asm 1: vpxor r3=reg256#8 +# asm 2: vpand r3=%ymm7 +vpand %ymm13,%ymm12,%ymm7 + +# qhasm: b2 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b2=reg256#14 +# asm 2: vbroadcasti128 0(b2=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm2,%ymm9 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm6,%ymm9 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm8,%ymm9 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm10,%ymm9 + +# qhasm: r3 ^= r +# asm 1: vpxor r2=reg256#10 +# asm 2: vpand r2=%ymm9 +vpand %ymm13,%ymm12,%ymm9 + +# qhasm: b1 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b1=reg256#14 +# asm 2: vbroadcasti128 
0(b1=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm2,%ymm11 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm4,%ymm11 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm6,%ymm11 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm8,%ymm11 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm10,%ymm11 + +# qhasm: r2 ^= r +# asm 1: vpxor r1=reg256#12 +# asm 2: vpand r1=%ymm11 +vpand %ymm13,%ymm12,%ymm11 + +# qhasm: b0 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b0=reg256#14 +# asm 2: vbroadcasti128 0(b0=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#2 +# asm 2: vpand r=%ymm1 +vpand %ymm13,%ymm1,%ymm1 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm2,%ymm0 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm4,%ymm0 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm6,%ymm0 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm8,%ymm0 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm10,%ymm0 + +# qhasm: r1 ^= r +# asm 1: vpxor r0=reg256#1 +# asm 2: vpand r0=%ymm0 +vpand %ymm13,%ymm12,%ymm0 + +# qhasm: mem256[ ptr + 160 ] = r5 +# asm 1: vmovupd h24=reg128#1 +# asm 2: movdqu 560(h24=%xmm0 +movdqu 560(%r8),%xmm0 + +# qhasm: h11 = h24 +# asm 1: movdqa h11=reg128#2 +# asm 2: movdqa h11=%xmm1 +movdqa %xmm0,%xmm1 + +# qhasm: h12 = h24 +# asm 1: movdqa h12=reg128#3 +# asm 2: movdqa h12=%xmm2 +movdqa %xmm0,%xmm2 + +# qhasm: h14 = h24 +# asm 1: movdqa h14=reg128#4 +# asm 2: movdqa h14=%xmm3 +movdqa %xmm0,%xmm3 + +# qhasm: h15 = h24 +# asm 1: movdqa h15=reg128#1 +# asm 2: movdqa h15=%xmm0 +movdqa %xmm0,%xmm0 + +# qhasm: h23 = mem128[ ptr + 528 ] +# asm 1: movdqu 528(h23=reg128#5 +# asm 2: movdqu 528(h23=%xmm4 +movdqu 528(%r8),%xmm4 + +# qhasm: h10 = h23 +# asm 1: movdqa h10=reg128#6 +# asm 2: movdqa h10=%xmm5 +movdqa %xmm4,%xmm5 + +# qhasm: h11 = h11 ^ h23 +# asm 1: vpxor h11=reg128#2 +# asm 2: vpxor h11=%xmm1 +vpxor %xmm4,%xmm1,%xmm1 + +# qhasm: h13 = h23 +# asm 1: movdqa h13=reg128#7 +# asm 2: movdqa h13=%xmm6 +movdqa %xmm4,%xmm6 + +# qhasm: h14 = h14 ^ h23 +# asm 1: vpxor h14=reg128#4 +# asm 2: vpxor h14=%xmm3 +vpxor %xmm4,%xmm3,%xmm3 + +# qhasm: h22 = mem128[ ptr + 496 ] +# asm 1: movdqu 496(h22=reg128#5 +# asm 2: movdqu 496(h22=%xmm4 +movdqu 496(%r8),%xmm4 + +# qhasm: h9 = h22 +# asm 1: movdqa h9=reg128#8 +# asm 2: movdqa h9=%xmm7 +movdqa %xmm4,%xmm7 + +# qhasm: h10 = h10 ^ h22 +# asm 1: vpxor h10=reg128#6 +# asm 2: vpxor h10=%xmm5 +vpxor %xmm4,%xmm5,%xmm5 + +# qhasm: h12 = h12 ^ h22 +# asm 1: vpxor h12=reg128#3 +# asm 2: vpxor h12=%xmm2 +vpxor %xmm4,%xmm2,%xmm2 + +# qhasm: h13 = h13 ^ h22 +# asm 1: vpxor h13=reg128#5 +# asm 2: vpxor h13=%xmm4 +vpxor %xmm4,%xmm6,%xmm4 + +# qhasm: h21 = mem128[ ptr + 464 ] +# asm 1: movdqu 464(h21=reg128#7 +# asm 2: movdqu 464(h21=%xmm6 +movdqu 464(%r8),%xmm6 + +# qhasm: h8 = h21 +# asm 1: movdqa h8=reg128#9 +# asm 2: movdqa h8=%xmm8 +movdqa %xmm6,%xmm8 + +# qhasm: h9 = h9 ^ h21 +# asm 1: vpxor h9=reg128#8 +# asm 2: vpxor h9=%xmm7 +vpxor %xmm6,%xmm7,%xmm7 + +# qhasm: h11 = h11 ^ 
h21 +# asm 1: vpxor h11=reg128#2 +# asm 2: vpxor h11=%xmm1 +vpxor %xmm6,%xmm1,%xmm1 + +# qhasm: h12 = h12 ^ h21 +# asm 1: vpxor h12=reg128#3 +# asm 2: vpxor h12=%xmm2 +vpxor %xmm6,%xmm2,%xmm2 + +# qhasm: h20 = mem128[ ptr + 432 ] +# asm 1: movdqu 432(h20=reg128#7 +# asm 2: movdqu 432(h20=%xmm6 +movdqu 432(%r8),%xmm6 + +# qhasm: h7 = h20 +# asm 1: movdqa h7=reg128#10 +# asm 2: movdqa h7=%xmm9 +movdqa %xmm6,%xmm9 + +# qhasm: h8 = h8 ^ h20 +# asm 1: vpxor h8=reg128#9 +# asm 2: vpxor h8=%xmm8 +vpxor %xmm6,%xmm8,%xmm8 + +# qhasm: h10 = h10 ^ h20 +# asm 1: vpxor h10=reg128#6 +# asm 2: vpxor h10=%xmm5 +vpxor %xmm6,%xmm5,%xmm5 + +# qhasm: h11 = h11 ^ h20 +# asm 1: vpxor h11=reg128#2 +# asm 2: vpxor h11=%xmm1 +vpxor %xmm6,%xmm1,%xmm1 + +# qhasm: h19 = mem128[ ptr + 400 ] +# asm 1: movdqu 400(h19=reg128#7 +# asm 2: movdqu 400(h19=%xmm6 +movdqu 400(%r8),%xmm6 + +# qhasm: h6 = h19 +# asm 1: movdqa h6=reg128#11 +# asm 2: movdqa h6=%xmm10 +movdqa %xmm6,%xmm10 + +# qhasm: h7 = h7 ^ h19 +# asm 1: vpxor h7=reg128#10 +# asm 2: vpxor h7=%xmm9 +vpxor %xmm6,%xmm9,%xmm9 + +# qhasm: h9 = h9 ^ h19 +# asm 1: vpxor h9=reg128#8 +# asm 2: vpxor h9=%xmm7 +vpxor %xmm6,%xmm7,%xmm7 + +# qhasm: h10 = h10 ^ h19 +# asm 1: vpxor h10=reg128#6 +# asm 2: vpxor h10=%xmm5 +vpxor %xmm6,%xmm5,%xmm5 + +# qhasm: h18 = mem128[ ptr + 368 ] +# asm 1: movdqu 368(h18=reg128#7 +# asm 2: movdqu 368(h18=%xmm6 +movdqu 368(%r8),%xmm6 + +# qhasm: h18 = h18 ^ mem128[ ptr + 576 ] +# asm 1: vpxor 576(h18=reg128#7 +# asm 2: vpxor 576(h18=%xmm6 +vpxor 576(%r8),%xmm6,%xmm6 + +# qhasm: h5 = h18 +# asm 1: movdqa h5=reg128#12 +# asm 2: movdqa h5=%xmm11 +movdqa %xmm6,%xmm11 + +# qhasm: h6 = h6 ^ h18 +# asm 1: vpxor h6=reg128#11 +# asm 2: vpxor h6=%xmm10 +vpxor %xmm6,%xmm10,%xmm10 + +# qhasm: h8 = h8 ^ h18 +# asm 1: vpxor h8=reg128#9 +# asm 2: vpxor h8=%xmm8 +vpxor %xmm6,%xmm8,%xmm8 + +# qhasm: h9 = h9 ^ h18 +# asm 1: vpxor h9=reg128#7 +# asm 2: vpxor h9=%xmm6 +vpxor %xmm6,%xmm7,%xmm6 + +# qhasm: h17 = mem128[ ptr + 336 ] +# asm 1: movdqu 336(h17=reg128#8 +# asm 2: movdqu 336(h17=%xmm7 +movdqu 336(%r8),%xmm7 + +# qhasm: h17 = h17 ^ mem128[ ptr + 544 ] +# asm 1: vpxor 544(h17=reg128#8 +# asm 2: vpxor 544(h17=%xmm7 +vpxor 544(%r8),%xmm7,%xmm7 + +# qhasm: h4 = h17 +# asm 1: movdqa h4=reg128#13 +# asm 2: movdqa h4=%xmm12 +movdqa %xmm7,%xmm12 + +# qhasm: h5 = h5 ^ h17 +# asm 1: vpxor h5=reg128#12 +# asm 2: vpxor h5=%xmm11 +vpxor %xmm7,%xmm11,%xmm11 + +# qhasm: h7 = h7 ^ h17 +# asm 1: vpxor h7=reg128#10 +# asm 2: vpxor h7=%xmm9 +vpxor %xmm7,%xmm9,%xmm9 + +# qhasm: h8 = h8 ^ h17 +# asm 1: vpxor h8=reg128#8 +# asm 2: vpxor h8=%xmm7 +vpxor %xmm7,%xmm8,%xmm7 + +# qhasm: h16 = mem128[ ptr + 304 ] +# asm 1: movdqu 304(h16=reg128#9 +# asm 2: movdqu 304(h16=%xmm8 +movdqu 304(%r8),%xmm8 + +# qhasm: h16 = h16 ^ mem128[ ptr + 512 ] +# asm 1: vpxor 512(h16=reg128#9 +# asm 2: vpxor 512(h16=%xmm8 +vpxor 512(%r8),%xmm8,%xmm8 + +# qhasm: h3 = h16 +# asm 1: movdqa h3=reg128#14 +# asm 2: movdqa h3=%xmm13 +movdqa %xmm8,%xmm13 + +# qhasm: h4 = h4 ^ h16 +# asm 1: vpxor h4=reg128#13 +# asm 2: vpxor h4=%xmm12 +vpxor %xmm8,%xmm12,%xmm12 + +# qhasm: h6 = h6 ^ h16 +# asm 1: vpxor h6=reg128#11 +# asm 2: vpxor h6=%xmm10 +vpxor %xmm8,%xmm10,%xmm10 + +# qhasm: h7 = h7 ^ h16 +# asm 1: vpxor h7=reg128#9 +# asm 2: vpxor h7=%xmm8 +vpxor %xmm8,%xmm9,%xmm8 + +# qhasm: h15 = h15 ^ mem128[ ptr + 272 ] +# asm 1: vpxor 272(h15=reg128#1 +# asm 2: vpxor 272(h15=%xmm0 +vpxor 272(%r8),%xmm0,%xmm0 + +# qhasm: h15 = h15 ^ mem128[ ptr + 480 ] +# asm 1: vpxor 480(h15=reg128#1 +# asm 2: vpxor 480(h15=%xmm0 
+vpxor 480(%r8),%xmm0,%xmm0 + +# qhasm: h2 = h15 +# asm 1: movdqa h2=reg128#10 +# asm 2: movdqa h2=%xmm9 +movdqa %xmm0,%xmm9 + +# qhasm: h3 = h3 ^ h15 +# asm 1: vpxor h3=reg128#14 +# asm 2: vpxor h3=%xmm13 +vpxor %xmm0,%xmm13,%xmm13 + +# qhasm: h5 = h5 ^ h15 +# asm 1: vpxor h5=reg128#12 +# asm 2: vpxor h5=%xmm11 +vpxor %xmm0,%xmm11,%xmm11 + +# qhasm: h6 = h6 ^ h15 +# asm 1: vpxor h6=reg128#1 +# asm 2: vpxor h6=%xmm0 +vpxor %xmm0,%xmm10,%xmm0 + +# qhasm: h14 = h14 ^ mem128[ ptr + 240 ] +# asm 1: vpxor 240(h14=reg128#4 +# asm 2: vpxor 240(h14=%xmm3 +vpxor 240(%r8),%xmm3,%xmm3 + +# qhasm: h14 = h14 ^ mem128[ ptr + 448 ] +# asm 1: vpxor 448(h14=reg128#4 +# asm 2: vpxor 448(h14=%xmm3 +vpxor 448(%r8),%xmm3,%xmm3 + +# qhasm: h1 = h14 +# asm 1: movdqa h1=reg128#11 +# asm 2: movdqa h1=%xmm10 +movdqa %xmm3,%xmm10 + +# qhasm: h2 = h2 ^ h14 +# asm 1: vpxor h2=reg128#10 +# asm 2: vpxor h2=%xmm9 +vpxor %xmm3,%xmm9,%xmm9 + +# qhasm: h4 = h4 ^ h14 +# asm 1: vpxor h4=reg128#13 +# asm 2: vpxor h4=%xmm12 +vpxor %xmm3,%xmm12,%xmm12 + +# qhasm: h5 = h5 ^ h14 +# asm 1: vpxor h5=reg128#4 +# asm 2: vpxor h5=%xmm3 +vpxor %xmm3,%xmm11,%xmm3 + +# qhasm: h13 = h13 ^ mem128[ ptr + 208 ] +# asm 1: vpxor 208(h13=reg128#5 +# asm 2: vpxor 208(h13=%xmm4 +vpxor 208(%r8),%xmm4,%xmm4 + +# qhasm: h13 = h13 ^ mem128[ ptr + 416 ] +# asm 1: vpxor 416(h13=reg128#5 +# asm 2: vpxor 416(h13=%xmm4 +vpxor 416(%r8),%xmm4,%xmm4 + +# qhasm: h0 = h13 +# asm 1: movdqa h0=reg128#12 +# asm 2: movdqa h0=%xmm11 +movdqa %xmm4,%xmm11 + +# qhasm: h1 = h1 ^ h13 +# asm 1: vpxor h1=reg128#11 +# asm 2: vpxor h1=%xmm10 +vpxor %xmm4,%xmm10,%xmm10 + +# qhasm: h3 = h3 ^ h13 +# asm 1: vpxor h3=reg128#14 +# asm 2: vpxor h3=%xmm13 +vpxor %xmm4,%xmm13,%xmm13 + +# qhasm: h4 = h4 ^ h13 +# asm 1: vpxor h4=reg128#5 +# asm 2: vpxor h4=%xmm4 +vpxor %xmm4,%xmm12,%xmm4 + +# qhasm: h12 = h12 ^ mem128[ ptr + 384 ] +# asm 1: vpxor 384(h12=reg128#3 +# asm 2: vpxor 384(h12=%xmm2 +vpxor 384(%r8),%xmm2,%xmm2 + +# qhasm: h12 = h12 ^ mem128[ ptr + 176 ] +# asm 1: vpxor 176(h12=reg128#3 +# asm 2: vpxor 176(h12=%xmm2 +vpxor 176(%r8),%xmm2,%xmm2 + +# qhasm: mem128[ input_0 + 192 ] = h12 +# asm 1: movdqu h11=reg128#2 +# asm 2: vpxor 352(h11=%xmm1 +vpxor 352(%r8),%xmm1,%xmm1 + +# qhasm: h11 = h11 ^ mem128[ ptr + 144 ] +# asm 1: vpxor 144(h11=reg128#2 +# asm 2: vpxor 144(h11=%xmm1 +vpxor 144(%r8),%xmm1,%xmm1 + +# qhasm: mem128[ input_0 + 176 ] = h11 +# asm 1: movdqu h10=reg128#2 +# asm 2: vpxor 320(h10=%xmm1 +vpxor 320(%r8),%xmm5,%xmm1 + +# qhasm: h10 = h10 ^ mem128[ ptr + 112 ] +# asm 1: vpxor 112(h10=reg128#2 +# asm 2: vpxor 112(h10=%xmm1 +vpxor 112(%r8),%xmm1,%xmm1 + +# qhasm: mem128[ input_0 + 160 ] = h10 +# asm 1: movdqu h9=reg128#2 +# asm 2: vpxor 288(h9=%xmm1 +vpxor 288(%r8),%xmm6,%xmm1 + +# qhasm: h9 = h9 ^ mem128[ ptr + 80 ] +# asm 1: vpxor 80(h9=reg128#2 +# asm 2: vpxor 80(h9=%xmm1 +vpxor 80(%r8),%xmm1,%xmm1 + +# qhasm: mem128[ input_0 + 144 ] = h9 +# asm 1: movdqu h8=reg128#2 +# asm 2: vpxor 256(h8=%xmm1 +vpxor 256(%r8),%xmm7,%xmm1 + +# qhasm: h8 = h8 ^ mem128[ ptr + 48 ] +# asm 1: vpxor 48(h8=reg128#2 +# asm 2: vpxor 48(h8=%xmm1 +vpxor 48(%r8),%xmm1,%xmm1 + +# qhasm: mem128[ input_0 + 128 ] = h8 +# asm 1: movdqu h7=reg128#2 +# asm 2: vpxor 224(h7=%xmm1 +vpxor 224(%r8),%xmm8,%xmm1 + +# qhasm: h7 = h7 ^ mem128[ ptr + 16 ] +# asm 1: vpxor 16(h7=reg128#2 +# asm 2: vpxor 16(h7=%xmm1 +vpxor 16(%r8),%xmm1,%xmm1 + +# qhasm: mem128[ input_0 + 112 ] = h7 +# asm 1: movdqu h6=reg128#1 +# asm 2: vpxor 192(h6=%xmm0 +vpxor 192(%r8),%xmm0,%xmm0 + +# qhasm: mem128[ input_0 + 96 ] = h6 
+# asm 1: movdqu h5=reg128#1 +# asm 2: vpxor 160(h5=%xmm0 +vpxor 160(%r8),%xmm3,%xmm0 + +# qhasm: mem128[ input_0 + 80 ] = h5 +# asm 1: movdqu h4=reg128#1 +# asm 2: vpxor 128(h4=%xmm0 +vpxor 128(%r8),%xmm4,%xmm0 + +# qhasm: mem128[ input_0 + 64 ] = h4 +# asm 1: movdqu h3=reg128#1 +# asm 2: vpxor 96(h3=%xmm0 +vpxor 96(%r8),%xmm13,%xmm0 + +# qhasm: mem128[ input_0 + 48 ] = h3 +# asm 1: movdqu h2=reg128#1 +# asm 2: vpxor 64(h2=%xmm0 +vpxor 64(%r8),%xmm9,%xmm0 + +# qhasm: mem128[ input_0 + 32 ] = h2 +# asm 1: movdqu h1=reg128#1 +# asm 2: vpxor 32(h1=%xmm0 +vpxor 32(%r8),%xmm10,%xmm0 + +# qhasm: mem128[ input_0 + 16 ] = h1 +# asm 1: movdqu h0=reg128#1 +# asm 2: vpxor 0(h0=%xmm0 +vpxor 0(%r8),%xmm11,%xmm0 + +# qhasm: mem128[ input_0 + 0 ] = h0 +# asm 1: movdqu + +typedef __m256i vec256; + +vec256 PQCLEAN_MCELIECE6960119F_AVX_vec256_set1_16b(uint16_t a); +vec256 PQCLEAN_MCELIECE6960119F_AVX_vec256_setzero(void); +vec256 PQCLEAN_MCELIECE6960119F_AVX_vec256_set4x(uint64_t a0, uint64_t a1, uint64_t a2, uint64_t a3); + +// Extract requires a literal argument so need to be macros +#define PQCLEAN_MCELIECE6960119F_AVX_vec256_extract2x(a,i) ((vec128) _mm256_extractf128_si256((vec256) (a),(i))) +#define PQCLEAN_MCELIECE6960119F_AVX_vec256_extract(a,i) ((uint64_t) _mm256_extract_epi64((vec256) (a),(i))) + +int PQCLEAN_MCELIECE6960119F_AVX_vec256_testz(vec256 a); +vec256 PQCLEAN_MCELIECE6960119F_AVX_vec256_and(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE6960119F_AVX_vec256_xor(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE6960119F_AVX_vec256_or(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE6960119F_AVX_vec256_sll_4x(vec256 a, int s); +vec256 PQCLEAN_MCELIECE6960119F_AVX_vec256_srl_4x(vec256 a, int s); +vec256 PQCLEAN_MCELIECE6960119F_AVX_vec256_unpack_low(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE6960119F_AVX_vec256_unpack_high(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE6960119F_AVX_vec256_unpack_low_2x(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE6960119F_AVX_vec256_unpack_high_2x(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE6960119F_AVX_vec256_or_reduce(const vec256 *a); +void PQCLEAN_MCELIECE6960119F_AVX_vec256_copy(vec256 *dest, const vec256 *src); + +/* bitsliced field multiplications */ +void PQCLEAN_MCELIECE6960119F_AVX_vec256_mul(vec256 *h, vec256 *f, const vec256 *g); +void PQCLEAN_MCELIECE6960119F_AVX_vec256_sq(vec256 * /*out*/, const vec256 * /*in*/); +void PQCLEAN_MCELIECE6960119F_AVX_vec256_inv(vec256 * /*out*/, const vec256 * /*in*/); + +extern void PQCLEAN_MCELIECE6960119F_AVX_vec256_maa_asm(vec256 *, vec256 *, const vec256 *); +extern void PQCLEAN_MCELIECE6960119F_AVX_vec256_ama_asm(vec256 *, vec256 *, const vec256 *); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/vec256_ama_asm.S b/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/vec256_ama_asm.S new file mode 100644 index 0000000000..14f698fd64 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece6960119f_avx/vec256_ama_asm.S @@ -0,0 +1,2322 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: reg256 a0 + +# qhasm: reg256 a1 + +# qhasm: reg256 a2 + +# qhasm: reg256 a3 + +# qhasm: reg256 a4 + +# qhasm: reg256 a5 + +# 
qhasm: reg256 a6 + +# qhasm: reg256 a7 + +# qhasm: reg256 a8 + +# qhasm: reg256 a9 + +# qhasm: reg256 a10 + +# qhasm: reg256 a11 + +# qhasm: reg256 a12 + +# qhasm: reg256 b0 + +# qhasm: reg256 b1 + +# qhasm: reg256 r0 + +# qhasm: reg256 r1 + +# qhasm: reg256 r2 + +# qhasm: reg256 r3 + +# qhasm: reg256 r4 + +# qhasm: reg256 r5 + +# qhasm: reg256 r6 + +# qhasm: reg256 r7 + +# qhasm: reg256 r8 + +# qhasm: reg256 r9 + +# qhasm: reg256 r10 + +# qhasm: reg256 r11 + +# qhasm: reg256 r12 + +# qhasm: reg256 r13 + +# qhasm: reg256 r14 + +# qhasm: reg256 r15 + +# qhasm: reg256 r16 + +# qhasm: reg256 r17 + +# qhasm: reg256 r18 + +# qhasm: reg256 r19 + +# qhasm: reg256 r20 + +# qhasm: reg256 r21 + +# qhasm: reg256 r22 + +# qhasm: reg256 r23 + +# qhasm: reg256 r24 + +# qhasm: reg256 r + +# qhasm: enter vec256_ama_asm +.p2align 5 +.global _PQCLEAN_MCELIECE6960119F_AVX_vec256_ama_asm +.global PQCLEAN_MCELIECE6960119F_AVX_vec256_ama_asm +_PQCLEAN_MCELIECE6960119F_AVX_vec256_ama_asm: +PQCLEAN_MCELIECE6960119F_AVX_vec256_ama_asm: +mov %rsp,%r11 +and $31,%r11 +add $0,%r11 +sub %r11,%rsp + +# qhasm: b0 = mem256[ input_2 + 0 ] +# asm 1: vmovupd 0(b0=reg256#1 +# asm 2: vmovupd 0(b0=%ymm0 +vmovupd 0(%rdx),%ymm0 + +# qhasm: a12 = mem256[ input_0 + 384 ] +# asm 1: vmovupd 384(a12=reg256#2 +# asm 2: vmovupd 384(a12=%ymm1 +vmovupd 384(%rdi),%ymm1 + +# qhasm: a12 = a12 ^ mem256[ input_1 + 384 ] +# asm 1: vpxor 384(a12=reg256#2 +# asm 2: vpxor 384(a12=%ymm1 +vpxor 384(%rsi),%ymm1,%ymm1 + +# qhasm: mem256[ input_0 + 384 ] = a12 +# asm 1: vmovupd r12=reg256#3 +# asm 2: vpand r12=%ymm2 +vpand %ymm1,%ymm0,%ymm2 + +# qhasm: r13 = a12 & mem256[input_2 + 32] +# asm 1: vpand 32(r13=reg256#4 +# asm 2: vpand 32(r13=%ymm3 +vpand 32(%rdx),%ymm1,%ymm3 + +# qhasm: r14 = a12 & mem256[input_2 + 64] +# asm 1: vpand 64(r14=reg256#5 +# asm 2: vpand 64(r14=%ymm4 +vpand 64(%rdx),%ymm1,%ymm4 + +# qhasm: r15 = a12 & mem256[input_2 + 96] +# asm 1: vpand 96(r15=reg256#6 +# asm 2: vpand 96(r15=%ymm5 +vpand 96(%rdx),%ymm1,%ymm5 + +# qhasm: r16 = a12 & mem256[input_2 + 128] +# asm 1: vpand 128(r16=reg256#7 +# asm 2: vpand 128(r16=%ymm6 +vpand 128(%rdx),%ymm1,%ymm6 + +# qhasm: r17 = a12 & mem256[input_2 + 160] +# asm 1: vpand 160(r17=reg256#8 +# asm 2: vpand 160(r17=%ymm7 +vpand 160(%rdx),%ymm1,%ymm7 + +# qhasm: r18 = a12 & mem256[input_2 + 192] +# asm 1: vpand 192(r18=reg256#9 +# asm 2: vpand 192(r18=%ymm8 +vpand 192(%rdx),%ymm1,%ymm8 + +# qhasm: r19 = a12 & mem256[input_2 + 224] +# asm 1: vpand 224(r19=reg256#10 +# asm 2: vpand 224(r19=%ymm9 +vpand 224(%rdx),%ymm1,%ymm9 + +# qhasm: r20 = a12 & mem256[input_2 + 256] +# asm 1: vpand 256(r20=reg256#11 +# asm 2: vpand 256(r20=%ymm10 +vpand 256(%rdx),%ymm1,%ymm10 + +# qhasm: r21 = a12 & mem256[input_2 + 288] +# asm 1: vpand 288(r21=reg256#12 +# asm 2: vpand 288(r21=%ymm11 +vpand 288(%rdx),%ymm1,%ymm11 + +# qhasm: r22 = a12 & mem256[input_2 + 320] +# asm 1: vpand 320(r22=reg256#13 +# asm 2: vpand 320(r22=%ymm12 +vpand 320(%rdx),%ymm1,%ymm12 + +# qhasm: r23 = a12 & mem256[input_2 + 352] +# asm 1: vpand 352(r23=reg256#14 +# asm 2: vpand 352(r23=%ymm13 +vpand 352(%rdx),%ymm1,%ymm13 + +# qhasm: r24 = a12 & mem256[input_2 + 384] +# asm 1: vpand 384(r24=reg256#2 +# asm 2: vpand 384(r24=%ymm1 +vpand 384(%rdx),%ymm1,%ymm1 + +# qhasm: r15 ^= r24 +# asm 1: vpxor r11=reg256#2 +# asm 2: vmovapd r11=%ymm1 +vmovapd %ymm1,%ymm1 + +# qhasm: a11 = mem256[ input_0 + 352 ] +# asm 1: vmovupd 352(a11=reg256#15 +# asm 2: vmovupd 352(a11=%ymm14 +vmovupd 352(%rdi),%ymm14 + +# qhasm: a11 = a11 ^ mem256[ input_1 + 352 ] +# asm 
1: vpxor 352(a11=reg256#15 +# asm 2: vpxor 352(a11=%ymm14 +vpxor 352(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 352 ] = a11 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r21 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r22 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r23 ^= r +# asm 1: vpxor r10=reg256#14 +# asm 2: vmovapd r10=%ymm13 +vmovapd %ymm13,%ymm13 + +# qhasm: a10 = mem256[ input_0 + 320 ] +# asm 1: vmovupd 320(a10=reg256#15 +# asm 2: vmovupd 320(a10=%ymm14 +vmovupd 320(%rdi),%ymm14 + +# qhasm: a10 = a10 ^ mem256[ input_1 + 320 ] +# asm 1: vpxor 320(a10=reg256#15 +# asm 2: vpxor 320(a10=%ymm14 +vpxor 320(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 320 ] = a10 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r21 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r22 ^= r +# asm 1: vpxor r9=reg256#13 +# asm 2: vmovapd r9=%ymm12 +vmovapd %ymm12,%ymm12 + +# qhasm: a9 = mem256[ input_0 + 288 ] +# asm 1: vmovupd 288(a9=reg256#15 +# asm 2: vmovupd 288(a9=%ymm14 +vmovupd 
288(%rdi),%ymm14 + +# qhasm: a9 = a9 ^ mem256[ input_1 + 288 ] +# asm 1: vpxor 288(a9=reg256#15 +# asm 2: vpxor 288(a9=%ymm14 +vpxor 288(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 288 ] = a9 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r21 ^= r +# asm 1: vpxor r8=reg256#12 +# asm 2: vmovapd r8=%ymm11 +vmovapd %ymm11,%ymm11 + +# qhasm: a8 = mem256[ input_0 + 256 ] +# asm 1: vmovupd 256(a8=reg256#15 +# asm 2: vmovupd 256(a8=%ymm14 +vmovupd 256(%rdi),%ymm14 + +# qhasm: a8 = a8 ^ mem256[ input_1 + 256 ] +# asm 1: vpxor 256(a8=reg256#15 +# asm 2: vpxor 256(a8=%ymm14 +vpxor 256(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 256 ] = a8 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r20 ^= r +# asm 1: vpxor r7=reg256#11 +# asm 2: vmovapd r7=%ymm10 +vmovapd %ymm10,%ymm10 + +# qhasm: a7 = mem256[ input_0 + 224 ] +# asm 1: vmovupd 
224(a7=reg256#15 +# asm 2: vmovupd 224(a7=%ymm14 +vmovupd 224(%rdi),%ymm14 + +# qhasm: a7 = a7 ^ mem256[ input_1 + 224 ] +# asm 1: vpxor 224(a7=reg256#15 +# asm 2: vpxor 224(a7=%ymm14 +vpxor 224(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 224 ] = a7 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r19 ^= r +# asm 1: vpxor r6=reg256#10 +# asm 2: vmovapd r6=%ymm9 +vmovapd %ymm9,%ymm9 + +# qhasm: a6 = mem256[ input_0 + 192 ] +# asm 1: vmovupd 192(a6=reg256#15 +# asm 2: vmovupd 192(a6=%ymm14 +vmovupd 192(%rdi),%ymm14 + +# qhasm: a6 = a6 ^ mem256[ input_1 + 192 ] +# asm 1: vpxor 192(a6=reg256#15 +# asm 2: vpxor 192(a6=%ymm14 +vpxor 192(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 192 ] = a6 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r18 ^= r +# asm 1: vpxor r5=reg256#9 +# asm 2: vmovapd r5=%ymm8 +vmovapd %ymm8,%ymm8 + +# qhasm: a5 = mem256[ 
input_0 + 160 ] +# asm 1: vmovupd 160(a5=reg256#15 +# asm 2: vmovupd 160(a5=%ymm14 +vmovupd 160(%rdi),%ymm14 + +# qhasm: a5 = a5 ^ mem256[ input_1 + 160 ] +# asm 1: vpxor 160(a5=reg256#15 +# asm 2: vpxor 160(a5=%ymm14 +vpxor 160(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 160 ] = a5 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r17 ^= r +# asm 1: vpxor r4=reg256#8 +# asm 2: vmovapd r4=%ymm7 +vmovapd %ymm7,%ymm7 + +# qhasm: a4 = mem256[ input_0 + 128 ] +# asm 1: vmovupd 128(a4=reg256#15 +# asm 2: vmovupd 128(a4=%ymm14 +vmovupd 128(%rdi),%ymm14 + +# qhasm: a4 = a4 ^ mem256[ input_1 + 128 ] +# asm 1: vpxor 128(a4=reg256#15 +# asm 2: vpxor 128(a4=%ymm14 +vpxor 128(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 128 ] = a4 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r16 ^= r +# asm 1: vpxor r3=reg256#7 +# asm 2: vmovapd r3=%ymm6 +vmovapd 
%ymm6,%ymm6 + +# qhasm: a3 = mem256[ input_0 + 96 ] +# asm 1: vmovupd 96(a3=reg256#15 +# asm 2: vmovupd 96(a3=%ymm14 +vmovupd 96(%rdi),%ymm14 + +# qhasm: a3 = a3 ^ mem256[ input_1 + 96 ] +# asm 1: vpxor 96(a3=reg256#15 +# asm 2: vpxor 96(a3=%ymm14 +vpxor 96(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 96 ] = a3 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r15 ^= r +# asm 1: vpxor r2=reg256#6 +# asm 2: vmovapd r2=%ymm5 +vmovapd %ymm5,%ymm5 + +# qhasm: a2 = mem256[ input_0 + 64 ] +# asm 1: vmovupd 64(a2=reg256#15 +# asm 2: vmovupd 64(a2=%ymm14 +vmovupd 64(%rdi),%ymm14 + +# qhasm: a2 = a2 ^ mem256[ input_1 + 64 ] +# asm 1: vpxor 64(a2=reg256#15 +# asm 2: vpxor 64(a2=%ymm14 +vpxor 64(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 64 ] = a2 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r14 ^= r +# asm 1: vpxor r1=reg256#5 +# asm 2: vmovapd r1=%ymm4 
+vmovapd %ymm4,%ymm4 + +# qhasm: a1 = mem256[ input_0 + 32 ] +# asm 1: vmovupd 32(a1=reg256#15 +# asm 2: vmovupd 32(a1=%ymm14 +vmovupd 32(%rdi),%ymm14 + +# qhasm: a1 = a1 ^ mem256[ input_1 + 32 ] +# asm 1: vpxor 32(a1=reg256#15 +# asm 2: vpxor 32(a1=%ymm14 +vpxor 32(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 32 ] = a1 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r1 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r13 ^= r +# asm 1: vpxor r0=reg256#4 +# asm 2: vmovapd r0=%ymm3 +vmovapd %ymm3,%ymm3 + +# qhasm: a0 = mem256[ input_0 + 0 ] +# asm 1: vmovupd 0(a0=reg256#15 +# asm 2: vmovupd 0(a0=%ymm14 +vmovupd 0(%rdi),%ymm14 + +# qhasm: a0 = a0 ^ mem256[ input_1 + 0 ] +# asm 1: vpxor 0(a0=reg256#15 +# asm 2: vpxor 0(a0=%ymm14 +vpxor 0(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 0 ] = a0 +# asm 1: vmovupd r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm14,%ymm0,%ymm0 + +# qhasm: r0 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 32(r=%ymm0 +vpand 32(%rdx),%ymm14,%ymm0 + +# qhasm: r1 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 64(r=%ymm0 +vpand 64(%rdx),%ymm14,%ymm0 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 96(r=%ymm0 +vpand 96(%rdx),%ymm14,%ymm0 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 128(r=%ymm0 +vpand 128(%rdx),%ymm14,%ymm0 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 160(r=%ymm0 +vpand 160(%rdx),%ymm14,%ymm0 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 192(r=%ymm0 +vpand 192(%rdx),%ymm14,%ymm0 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 224(r=%ymm0 +vpand 224(%rdx),%ymm14,%ymm0 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 256(r=%ymm0 +vpand 256(%rdx),%ymm14,%ymm0 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 288(r=%ymm0 +vpand 288(%rdx),%ymm14,%ymm0 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 320(r=%ymm0 +vpand 320(%rdx),%ymm14,%ymm0 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 352(r=%ymm0 +vpand 352(%rdx),%ymm14,%ymm0 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 384(r=%ymm0 +vpand 384(%rdx),%ymm14,%ymm0 + +# qhasm: r12 ^= r +# asm 1: vpxor r12=reg256#1 +# asm 2: vpxor 384(r12=%ymm0 +vpxor 384(%rsi),%ymm2,%ymm0 + +# qhasm: 
mem256[ input_1 + 384 ] = r12 +# asm 1: vmovupd r11=reg256#1 +# asm 2: vpxor 352(r11=%ymm0 +vpxor 352(%rsi),%ymm1,%ymm0 + +# qhasm: mem256[ input_1 + 352 ] = r11 +# asm 1: vmovupd r10=reg256#1 +# asm 2: vpxor 320(r10=%ymm0 +vpxor 320(%rsi),%ymm13,%ymm0 + +# qhasm: mem256[ input_1 + 320 ] = r10 +# asm 1: vmovupd r9=reg256#1 +# asm 2: vpxor 288(r9=%ymm0 +vpxor 288(%rsi),%ymm12,%ymm0 + +# qhasm: mem256[ input_1 + 288 ] = r9 +# asm 1: vmovupd r8=reg256#1 +# asm 2: vpxor 256(r8=%ymm0 +vpxor 256(%rsi),%ymm11,%ymm0 + +# qhasm: mem256[ input_1 + 256 ] = r8 +# asm 1: vmovupd r7=reg256#1 +# asm 2: vpxor 224(r7=%ymm0 +vpxor 224(%rsi),%ymm10,%ymm0 + +# qhasm: mem256[ input_1 + 224 ] = r7 +# asm 1: vmovupd r6=reg256#1 +# asm 2: vpxor 192(r6=%ymm0 +vpxor 192(%rsi),%ymm9,%ymm0 + +# qhasm: mem256[ input_1 + 192 ] = r6 +# asm 1: vmovupd r5=reg256#1 +# asm 2: vpxor 160(r5=%ymm0 +vpxor 160(%rsi),%ymm8,%ymm0 + +# qhasm: mem256[ input_1 + 160 ] = r5 +# asm 1: vmovupd r4=reg256#1 +# asm 2: vpxor 128(r4=%ymm0 +vpxor 128(%rsi),%ymm7,%ymm0 + +# qhasm: mem256[ input_1 + 128 ] = r4 +# asm 1: vmovupd r3=reg256#1 +# asm 2: vpxor 96(r3=%ymm0 +vpxor 96(%rsi),%ymm6,%ymm0 + +# qhasm: mem256[ input_1 + 96 ] = r3 +# asm 1: vmovupd r2=reg256#1 +# asm 2: vpxor 64(r2=%ymm0 +vpxor 64(%rsi),%ymm5,%ymm0 + +# qhasm: mem256[ input_1 + 64 ] = r2 +# asm 1: vmovupd r1=reg256#1 +# asm 2: vpxor 32(r1=%ymm0 +vpxor 32(%rsi),%ymm4,%ymm0 + +# qhasm: mem256[ input_1 + 32 ] = r1 +# asm 1: vmovupd r0=reg256#1 +# asm 2: vpxor 0(r0=%ymm0 +vpxor 0(%rsi),%ymm3,%ymm0 + +# qhasm: mem256[ input_1 + 0 ] = r0 +# asm 1: vmovupd b0=reg256#1 +# asm 2: vmovupd 0(b0=%ymm0 +vmovupd 0(%rdx),%ymm0 + +# qhasm: a12 = mem256[ input_1 + 384 ] +# asm 1: vmovupd 384(a12=reg256#2 +# asm 2: vmovupd 384(a12=%ymm1 +vmovupd 384(%rsi),%ymm1 + +# qhasm: r12 = a12 & b0 +# asm 1: vpand r12=reg256#3 +# asm 2: vpand r12=%ymm2 +vpand %ymm1,%ymm0,%ymm2 + +# qhasm: r13 = a12 & mem256[input_2 + 32] +# asm 1: vpand 32(r13=reg256#4 +# asm 2: vpand 32(r13=%ymm3 +vpand 32(%rdx),%ymm1,%ymm3 + +# qhasm: r14 = a12 & mem256[input_2 + 64] +# asm 1: vpand 64(r14=reg256#5 +# asm 2: vpand 64(r14=%ymm4 +vpand 64(%rdx),%ymm1,%ymm4 + +# qhasm: r15 = a12 & mem256[input_2 + 96] +# asm 1: vpand 96(r15=reg256#6 +# asm 2: vpand 96(r15=%ymm5 +vpand 96(%rdx),%ymm1,%ymm5 + +# qhasm: r16 = a12 & mem256[input_2 + 128] +# asm 1: vpand 128(r16=reg256#7 +# asm 2: vpand 128(r16=%ymm6 +vpand 128(%rdx),%ymm1,%ymm6 + +# qhasm: r17 = a12 & mem256[input_2 + 160] +# asm 1: vpand 160(r17=reg256#8 +# asm 2: vpand 160(r17=%ymm7 +vpand 160(%rdx),%ymm1,%ymm7 + +# qhasm: r18 = a12 & mem256[input_2 + 192] +# asm 1: vpand 192(r18=reg256#9 +# asm 2: vpand 192(r18=%ymm8 +vpand 192(%rdx),%ymm1,%ymm8 + +# qhasm: r19 = a12 & mem256[input_2 + 224] +# asm 1: vpand 224(r19=reg256#10 +# asm 2: vpand 224(r19=%ymm9 +vpand 224(%rdx),%ymm1,%ymm9 + +# qhasm: r20 = a12 & mem256[input_2 + 256] +# asm 1: vpand 256(r20=reg256#11 +# asm 2: vpand 256(r20=%ymm10 +vpand 256(%rdx),%ymm1,%ymm10 + +# qhasm: r21 = a12 & mem256[input_2 + 288] +# asm 1: vpand 288(r21=reg256#12 +# asm 2: vpand 288(r21=%ymm11 +vpand 288(%rdx),%ymm1,%ymm11 + +# qhasm: r22 = a12 & mem256[input_2 + 320] +# asm 1: vpand 320(r22=reg256#13 +# asm 2: vpand 320(r22=%ymm12 +vpand 320(%rdx),%ymm1,%ymm12 + +# qhasm: r23 = a12 & mem256[input_2 + 352] +# asm 1: vpand 352(r23=reg256#14 +# asm 2: vpand 352(r23=%ymm13 +vpand 352(%rdx),%ymm1,%ymm13 + +# qhasm: r24 = a12 & mem256[input_2 + 384] +# asm 1: vpand 384(r24=reg256#2 +# asm 2: vpand 384(r24=%ymm1 +vpand 384(%rdx),%ymm1,%ymm1 + 
+# qhasm: r15 ^= r24 +# asm 1: vpxor r11=reg256#2 +# asm 2: vmovapd r11=%ymm1 +vmovapd %ymm1,%ymm1 + +# qhasm: a11 = mem256[ input_1 + 352 ] +# asm 1: vmovupd 352(a11=reg256#15 +# asm 2: vmovupd 352(a11=%ymm14 +vmovupd 352(%rsi),%ymm14 + +# qhasm: r = a11 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r21 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r22 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r23 ^= r +# asm 1: vpxor r10=reg256#14 +# asm 2: vmovapd r10=%ymm13 +vmovapd %ymm13,%ymm13 + +# qhasm: a10 = mem256[ input_1 + 320 ] +# asm 1: vmovupd 320(a10=reg256#15 +# asm 2: vmovupd 320(a10=%ymm14 +vmovupd 320(%rsi),%ymm14 + +# qhasm: r = a10 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r21 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r22 ^= r +# asm 1: vpxor r9=reg256#13 +# asm 2: vmovapd r9=%ymm12 +vmovapd %ymm12,%ymm12 + +# qhasm: a9 = mem256[ input_1 + 288 ] +# asm 1: vmovupd 288(a9=reg256#15 +# asm 2: vmovupd 288(a9=%ymm14 +vmovupd 288(%rsi),%ymm14 + +# qhasm: r = a9 & b0 +# 
asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r21 ^= r +# asm 1: vpxor r8=reg256#12 +# asm 2: vmovapd r8=%ymm11 +vmovapd %ymm11,%ymm11 + +# qhasm: a8 = mem256[ input_1 + 256 ] +# asm 1: vmovupd 256(a8=reg256#15 +# asm 2: vmovupd 256(a8=%ymm14 +vmovupd 256(%rsi),%ymm14 + +# qhasm: r = a8 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r20 ^= r +# asm 1: vpxor r7=reg256#11 +# asm 2: vmovapd r7=%ymm10 +vmovapd %ymm10,%ymm10 + +# qhasm: a7 = mem256[ input_1 + 224 ] +# asm 1: vmovupd 224(a7=reg256#15 +# asm 2: vmovupd 224(a7=%ymm14 +vmovupd 224(%rsi),%ymm14 + +# qhasm: r = a7 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 
64(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r19 ^= r +# asm 1: vpxor r6=reg256#10 +# asm 2: vmovapd r6=%ymm9 +vmovapd %ymm9,%ymm9 + +# qhasm: a6 = mem256[ input_1 + 192 ] +# asm 1: vmovupd 192(a6=reg256#15 +# asm 2: vmovupd 192(a6=%ymm14 +vmovupd 192(%rsi),%ymm14 + +# qhasm: r = a6 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r18 ^= r +# asm 1: vpxor r5=reg256#9 +# asm 2: vmovapd r5=%ymm8 +vmovapd %ymm8,%ymm8 + +# qhasm: a5 = mem256[ input_1 + 160 ] +# asm 1: vmovupd 160(a5=reg256#15 +# asm 2: vmovupd 160(a5=%ymm14 +vmovupd 160(%rsi),%ymm14 + +# qhasm: r = a5 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 
+# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r17 ^= r +# asm 1: vpxor r4=reg256#8 +# asm 2: vmovapd r4=%ymm7 +vmovapd %ymm7,%ymm7 + +# qhasm: a4 = mem256[ input_1 + 128 ] +# asm 1: vmovupd 128(a4=reg256#15 +# asm 2: vmovupd 128(a4=%ymm14 +vmovupd 128(%rsi),%ymm14 + +# qhasm: r = a4 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r16 ^= r +# asm 1: vpxor r3=reg256#7 +# asm 2: vmovapd r3=%ymm6 +vmovapd %ymm6,%ymm6 + +# qhasm: a3 = mem256[ input_1 + 96 ] +# asm 1: vmovupd 96(a3=reg256#15 +# asm 2: vmovupd 96(a3=%ymm14 +vmovupd 96(%rsi),%ymm14 + +# qhasm: r = a3 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= 
r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r15 ^= r +# asm 1: vpxor r2=reg256#6 +# asm 2: vmovapd r2=%ymm5 +vmovapd %ymm5,%ymm5 + +# qhasm: a2 = mem256[ input_1 + 64 ] +# asm 1: vmovupd 64(a2=reg256#15 +# asm 2: vmovupd 64(a2=%ymm14 +vmovupd 64(%rsi),%ymm14 + +# qhasm: r = a2 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r14 ^= r +# asm 1: vpxor r1=reg256#5 +# asm 2: vmovapd r1=%ymm4 +vmovapd %ymm4,%ymm4 + +# qhasm: a1 = mem256[ input_1 + 32 ] +# asm 1: vmovupd 32(a1=reg256#15 +# asm 2: vmovupd 32(a1=%ymm14 +vmovupd 32(%rsi),%ymm14 + +# qhasm: r = a1 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r1 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 
320(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r13 ^= r +# asm 1: vpxor r0=reg256#4 +# asm 2: vmovapd r0=%ymm3 +vmovapd %ymm3,%ymm3 + +# qhasm: a0 = mem256[ input_1 + 0 ] +# asm 1: vmovupd 0(a0=reg256#15 +# asm 2: vmovupd 0(a0=%ymm14 +vmovupd 0(%rsi),%ymm14 + +# qhasm: r = a0 & b0 +# asm 1: vpand r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm14,%ymm0,%ymm0 + +# qhasm: r0 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 32(r=%ymm0 +vpand 32(%rdx),%ymm14,%ymm0 + +# qhasm: r1 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 64(r=%ymm0 +vpand 64(%rdx),%ymm14,%ymm0 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 96(r=%ymm0 +vpand 96(%rdx),%ymm14,%ymm0 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 128(r=%ymm0 +vpand 128(%rdx),%ymm14,%ymm0 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 160(r=%ymm0 +vpand 160(%rdx),%ymm14,%ymm0 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 192(r=%ymm0 +vpand 192(%rdx),%ymm14,%ymm0 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 224(r=%ymm0 +vpand 224(%rdx),%ymm14,%ymm0 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 256(r=%ymm0 +vpand 256(%rdx),%ymm14,%ymm0 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 288(r=%ymm0 +vpand 288(%rdx),%ymm14,%ymm0 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 320(r=%ymm0 +vpand 320(%rdx),%ymm14,%ymm0 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 352(r=%ymm0 +vpand 352(%rdx),%ymm14,%ymm0 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 384(r=%ymm0 +vpand 384(%rdx),%ymm14,%ymm0 + +# qhasm: r12 ^= r +# asm 1: vpxor r12=reg256#1 +# asm 2: vpxor 384(r12=%ymm0 +vpxor 384(%rdi),%ymm2,%ymm0 + +# qhasm: mem256[ input_0 + 384 ] = r12 +# asm 1: vmovupd r12=reg256#1 +# asm 2: vpxor 384(r12=%ymm0 +vpxor 384(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 384 ] = r12 +# asm 1: vmovupd r11=reg256#1 +# asm 2: vpxor 352(r11=%ymm0 +vpxor 352(%rdi),%ymm1,%ymm0 + +# qhasm: mem256[ input_0 + 352 ] = r11 +# asm 1: vmovupd r11=reg256#1 +# asm 2: vpxor 352(r11=%ymm0 +vpxor 352(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 352 ] = r11 +# asm 1: vmovupd r10=reg256#1 +# asm 2: vpxor 320(r10=%ymm0 +vpxor 320(%rdi),%ymm13,%ymm0 + +# qhasm: mem256[ input_0 + 320 ] = r10 +# asm 1: vmovupd r10=reg256#1 +# asm 2: vpxor 320(r10=%ymm0 +vpxor 320(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 320 ] = r10 +# asm 1: vmovupd r9=reg256#1 +# asm 2: vpxor 288(r9=%ymm0 +vpxor 288(%rdi),%ymm12,%ymm0 + +# qhasm: mem256[ input_0 + 288 ] = r9 +# asm 1: vmovupd r9=reg256#1 +# asm 2: vpxor 288(r9=%ymm0 +vpxor 288(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 288 ] = r9 +# asm 1: vmovupd r8=reg256#1 +# asm 2: vpxor 256(r8=%ymm0 +vpxor 256(%rdi),%ymm11,%ymm0 + +# qhasm: mem256[ input_0 + 256 ] = r8 +# asm 1: vmovupd r8=reg256#1 +# asm 2: vpxor 256(r8=%ymm0 +vpxor 256(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 256 ] = r8 +# asm 1: vmovupd r7=reg256#1 +# asm 2: vpxor 224(r7=%ymm0 +vpxor 224(%rdi),%ymm10,%ymm0 + +# qhasm: mem256[ input_0 + 224 ] = r7 +# asm 1: vmovupd r7=reg256#1 +# asm 2: vpxor 224(r7=%ymm0 +vpxor 224(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 224 ] = r7 +# asm 1: vmovupd r6=reg256#1 +# asm 2: vpxor 192(r6=%ymm0 +vpxor 192(%rdi),%ymm9,%ymm0 + +# qhasm: mem256[ input_0 + 192 ] = r6 +# asm 1: 
vmovupd r6=reg256#1 +# asm 2: vpxor 192(r6=%ymm0 +vpxor 192(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 192 ] = r6 +# asm 1: vmovupd r5=reg256#1 +# asm 2: vpxor 160(r5=%ymm0 +vpxor 160(%rdi),%ymm8,%ymm0 + +# qhasm: mem256[ input_0 + 160 ] = r5 +# asm 1: vmovupd r5=reg256#1 +# asm 2: vpxor 160(r5=%ymm0 +vpxor 160(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 160 ] = r5 +# asm 1: vmovupd r4=reg256#1 +# asm 2: vpxor 128(r4=%ymm0 +vpxor 128(%rdi),%ymm7,%ymm0 + +# qhasm: mem256[ input_0 + 128 ] = r4 +# asm 1: vmovupd r4=reg256#1 +# asm 2: vpxor 128(r4=%ymm0 +vpxor 128(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 128 ] = r4 +# asm 1: vmovupd r3=reg256#1 +# asm 2: vpxor 96(r3=%ymm0 +vpxor 96(%rdi),%ymm6,%ymm0 + +# qhasm: mem256[ input_0 + 96 ] = r3 +# asm 1: vmovupd r3=reg256#1 +# asm 2: vpxor 96(r3=%ymm0 +vpxor 96(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 96 ] = r3 +# asm 1: vmovupd r2=reg256#1 +# asm 2: vpxor 64(r2=%ymm0 +vpxor 64(%rdi),%ymm5,%ymm0 + +# qhasm: mem256[ input_0 + 64 ] = r2 +# asm 1: vmovupd r2=reg256#1 +# asm 2: vpxor 64(r2=%ymm0 +vpxor 64(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 64 ] = r2 +# asm 1: vmovupd r1=reg256#1 +# asm 2: vpxor 32(r1=%ymm0 +vpxor 32(%rdi),%ymm4,%ymm0 + +# qhasm: mem256[ input_0 + 32 ] = r1 +# asm 1: vmovupd r1=reg256#1 +# asm 2: vpxor 32(r1=%ymm0 +vpxor 32(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 32 ] = r1 +# asm 1: vmovupd r0=reg256#1 +# asm 2: vpxor 0(r0=%ymm0 +vpxor 0(%rdi),%ymm3,%ymm0 + +# qhasm: mem256[ input_0 + 0 ] = r0 +# asm 1: vmovupd r0=reg256#1 +# asm 2: vpxor 0(r0=%ymm0 +vpxor 0(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 0 ] = r0 +# asm 1: vmovupd b0=reg256#1 +# asm 2: vmovupd 0(b0=%ymm0 +vmovupd 0(%rdx),%ymm0 + +# qhasm: a12 = mem256[ input_1 + 384 ] +# asm 1: vmovupd 384(a12=reg256#2 +# asm 2: vmovupd 384(a12=%ymm1 +vmovupd 384(%rsi),%ymm1 + +# qhasm: r12 = a12 & b0 +# asm 1: vpand r12=reg256#3 +# asm 2: vpand r12=%ymm2 +vpand %ymm1,%ymm0,%ymm2 + +# qhasm: r13 = a12 & mem256[input_2 + 32] +# asm 1: vpand 32(r13=reg256#4 +# asm 2: vpand 32(r13=%ymm3 +vpand 32(%rdx),%ymm1,%ymm3 + +# qhasm: r14 = a12 & mem256[input_2 + 64] +# asm 1: vpand 64(r14=reg256#5 +# asm 2: vpand 64(r14=%ymm4 +vpand 64(%rdx),%ymm1,%ymm4 + +# qhasm: r15 = a12 & mem256[input_2 + 96] +# asm 1: vpand 96(r15=reg256#6 +# asm 2: vpand 96(r15=%ymm5 +vpand 96(%rdx),%ymm1,%ymm5 + +# qhasm: r16 = a12 & mem256[input_2 + 128] +# asm 1: vpand 128(r16=reg256#7 +# asm 2: vpand 128(r16=%ymm6 +vpand 128(%rdx),%ymm1,%ymm6 + +# qhasm: r17 = a12 & mem256[input_2 + 160] +# asm 1: vpand 160(r17=reg256#8 +# asm 2: vpand 160(r17=%ymm7 +vpand 160(%rdx),%ymm1,%ymm7 + +# qhasm: r18 = a12 & mem256[input_2 + 192] +# asm 1: vpand 192(r18=reg256#9 +# asm 2: vpand 192(r18=%ymm8 +vpand 192(%rdx),%ymm1,%ymm8 + +# qhasm: r19 = a12 & mem256[input_2 + 224] +# asm 1: vpand 224(r19=reg256#10 +# asm 2: vpand 224(r19=%ymm9 +vpand 224(%rdx),%ymm1,%ymm9 + +# qhasm: r20 = a12 & mem256[input_2 + 256] +# asm 1: vpand 256(r20=reg256#11 +# asm 2: vpand 256(r20=%ymm10 +vpand 256(%rdx),%ymm1,%ymm10 + +# qhasm: r21 = a12 & mem256[input_2 + 288] +# asm 1: vpand 288(r21=reg256#12 +# asm 2: vpand 288(r21=%ymm11 +vpand 288(%rdx),%ymm1,%ymm11 + +# qhasm: r22 = a12 & mem256[input_2 + 320] +# asm 1: vpand 320(r22=reg256#13 +# asm 2: vpand 320(r22=%ymm12 +vpand 320(%rdx),%ymm1,%ymm12 + +# qhasm: r23 = a12 & mem256[input_2 + 352] +# asm 1: vpand 352(r23=reg256#14 +# asm 2: vpand 352(r23=%ymm13 +vpand 352(%rdx),%ymm1,%ymm13 + +# qhasm: r24 = a12 & mem256[input_2 + 384] +# asm 1: vpand 
384(r24=reg256#2 +# asm 2: vpand 384(r24=%ymm1 +vpand 384(%rdx),%ymm1,%ymm1 + +# qhasm: r15 ^= r24 +# asm 1: vpxor r11=reg256#2 +# asm 2: vmovapd r11=%ymm1 +vmovapd %ymm1,%ymm1 + +# qhasm: a11 = mem256[ input_1 + 352 ] +# asm 1: vmovupd 352(a11=reg256#15 +# asm 2: vmovupd 352(a11=%ymm14 +vmovupd 352(%rsi),%ymm14 + +# qhasm: r = a11 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r21 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r22 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r23 ^= r +# asm 1: vpxor r10=reg256#14 +# asm 2: vmovapd r10=%ymm13 +vmovapd %ymm13,%ymm13 + +# qhasm: a10 = mem256[ input_1 + 320 ] +# asm 1: vmovupd 320(a10=reg256#15 +# asm 2: vmovupd 320(a10=%ymm14 +vmovupd 320(%rsi),%ymm14 + +# qhasm: r = a10 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r21 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r22 ^= r +# asm 1: vpxor r9=reg256#13 +# asm 2: vmovapd r9=%ymm12 +vmovapd %ymm12,%ymm12 + +# qhasm: a9 = mem256[ input_1 + 288 ] +# asm 1: vmovupd 288(a9=reg256#15 +# asm 2: 
vmovupd 288(a9=%ymm14 +vmovupd 288(%rsi),%ymm14 + +# qhasm: r = a9 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r21 ^= r +# asm 1: vpxor r8=reg256#12 +# asm 2: vmovapd r8=%ymm11 +vmovapd %ymm11,%ymm11 + +# qhasm: a8 = mem256[ input_1 + 256 ] +# asm 1: vmovupd 256(a8=reg256#15 +# asm 2: vmovupd 256(a8=%ymm14 +vmovupd 256(%rsi),%ymm14 + +# qhasm: r = a8 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r20 ^= r +# asm 1: vpxor r7=reg256#11 +# asm 2: vmovapd r7=%ymm10 +vmovapd %ymm10,%ymm10 + +# qhasm: a7 = mem256[ input_1 + 224 ] +# asm 1: vmovupd 224(a7=reg256#15 +# asm 2: vmovupd 224(a7=%ymm14 +vmovupd 224(%rsi),%ymm14 + +# qhasm: r = a7 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= 
r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r19 ^= r +# asm 1: vpxor r6=reg256#10 +# asm 2: vmovapd r6=%ymm9 +vmovapd %ymm9,%ymm9 + +# qhasm: a6 = mem256[ input_1 + 192 ] +# asm 1: vmovupd 192(a6=reg256#15 +# asm 2: vmovupd 192(a6=%ymm14 +vmovupd 192(%rsi),%ymm14 + +# qhasm: r = a6 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r18 ^= r +# asm 1: vpxor r5=reg256#9 +# asm 2: vmovapd r5=%ymm8 +vmovapd %ymm8,%ymm8 + +# qhasm: a5 = mem256[ input_1 + 160 ] +# asm 1: vmovupd 160(a5=reg256#15 +# asm 2: vmovupd 160(a5=%ymm14 +vmovupd 160(%rsi),%ymm14 + +# qhasm: r = a5 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 
128(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r17 ^= r +# asm 1: vpxor r4=reg256#8 +# asm 2: vmovapd r4=%ymm7 +vmovapd %ymm7,%ymm7 + +# qhasm: a4 = mem256[ input_1 + 128 ] +# asm 1: vmovupd 128(a4=reg256#15 +# asm 2: vmovupd 128(a4=%ymm14 +vmovupd 128(%rsi),%ymm14 + +# qhasm: r = a4 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r16 ^= r +# asm 1: vpxor r3=reg256#7 +# asm 2: vmovapd r3=%ymm6 +vmovapd %ymm6,%ymm6 + +# qhasm: a3 = mem256[ input_1 + 96 ] +# asm 1: vmovupd 96(a3=reg256#15 +# asm 2: vmovupd 96(a3=%ymm14 +vmovupd 96(%rsi),%ymm14 + +# qhasm: r = a3 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 
2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r15 ^= r +# asm 1: vpxor r2=reg256#6 +# asm 2: vmovapd r2=%ymm5 +vmovapd %ymm5,%ymm5 + +# qhasm: a2 = mem256[ input_1 + 64 ] +# asm 1: vmovupd 64(a2=reg256#15 +# asm 2: vmovupd 64(a2=%ymm14 +vmovupd 64(%rsi),%ymm14 + +# qhasm: r = a2 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r14 ^= r +# asm 1: vpxor r1=reg256#5 +# asm 2: vmovapd r1=%ymm4 +vmovapd %ymm4,%ymm4 + +# qhasm: a1 = mem256[ input_1 + 32 ] +# asm 1: vmovupd 32(a1=reg256#15 +# asm 2: vmovupd 32(a1=%ymm14 +vmovupd 32(%rsi),%ymm14 + +# qhasm: r = a1 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r1 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: 
vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r13 ^= r +# asm 1: vpxor r0=reg256#4 +# asm 2: vmovapd r0=%ymm3 +vmovapd %ymm3,%ymm3 + +# qhasm: a0 = mem256[ input_1 + 0 ] +# asm 1: vmovupd 0(a0=reg256#15 +# asm 2: vmovupd 0(a0=%ymm14 +vmovupd 0(%rsi),%ymm14 + +# qhasm: r = a0 & b0 +# asm 1: vpand r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm14,%ymm0,%ymm0 + +# qhasm: r0 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 32(r=%ymm0 +vpand 32(%rdx),%ymm14,%ymm0 + +# qhasm: r1 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 64(r=%ymm0 +vpand 64(%rdx),%ymm14,%ymm0 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 96(r=%ymm0 +vpand 96(%rdx),%ymm14,%ymm0 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 128(r=%ymm0 +vpand 128(%rdx),%ymm14,%ymm0 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 160(r=%ymm0 +vpand 160(%rdx),%ymm14,%ymm0 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 192(r=%ymm0 +vpand 192(%rdx),%ymm14,%ymm0 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 224(r=%ymm0 +vpand 224(%rdx),%ymm14,%ymm0 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 256(r=%ymm0 +vpand 256(%rdx),%ymm14,%ymm0 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 288(r=%ymm0 +vpand 288(%rdx),%ymm14,%ymm0 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 320(r=%ymm0 +vpand 320(%rdx),%ymm14,%ymm0 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 352(r=%ymm0 +vpand 352(%rdx),%ymm14,%ymm0 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 384(r=%ymm0 +vpand 384(%rdx),%ymm14,%ymm0 + +# qhasm: r12 ^= r +# asm 1: vpxor r=int64#7 +# asm 2: mov $0,>r=%rax +mov $0,%rax + +# qhasm: t0 = mem64[ input_0 + 192 ] +# asm 1: movq 192(t0=int64#2 +# asm 2: movq 192(t0=%rsi +movq 192(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 200 ] +# asm 1: movq 200(t1=int64#3 +# asm 2: movq 200(t1=%rdx +movq 200(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 176(t0=%rsi +movq 176(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 184 ] +# asm 1: movq 184(t1=int64#3 +# asm 2: movq 184(t1=%rdx +movq 184(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 160(t0=%rsi +movq 160(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 168 ] +# asm 1: movq 168(t1=int64#3 +# asm 2: movq 168(t1=%rdx +movq 168(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 144(t0=%rsi +movq 144(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 152 ] +# asm 1: movq 152(t1=int64#3 +# asm 2: movq 152(t1=%rdx +movq 152(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 128(t0=%rsi +movq 128(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 136 ] +# asm 1: movq 136(t1=int64#3 +# asm 2: movq 136(t1=%rdx +movq 136(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 
+# asm 1: and $1,t0=int64#2 +# asm 2: movq 112(t0=%rsi +movq 112(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 120 ] +# asm 1: movq 120(t1=int64#3 +# asm 2: movq 120(t1=%rdx +movq 120(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 96(t0=%rsi +movq 96(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 104 ] +# asm 1: movq 104(t1=int64#3 +# asm 2: movq 104(t1=%rdx +movq 104(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 80(t0=%rsi +movq 80(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 88 ] +# asm 1: movq 88(t1=int64#3 +# asm 2: movq 88(t1=%rdx +movq 88(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 64(t0=%rsi +movq 64(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 72 ] +# asm 1: movq 72(t1=int64#3 +# asm 2: movq 72(t1=%rdx +movq 72(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 48(t0=%rsi +movq 48(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 56 ] +# asm 1: movq 56(t1=int64#3 +# asm 2: movq 56(t1=%rdx +movq 56(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 32(t0=%rsi +movq 32(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 40 ] +# asm 1: movq 40(t1=int64#3 +# asm 2: movq 40(t1=%rdx +movq 40(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 16(t0=%rsi +movq 16(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 24 ] +# asm 1: movq 24(t1=int64#3 +# asm 2: movq 24(t1=%rdx +movq 24(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 0(t0=%rsi +movq 0(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(t1=int64#1 +# asm 2: movq 8(t1=%rdi +movq 8(%rdi),%rdi + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#1 +# asm 2: popcnt c=%rdi +popcnt %rsi, %rdi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1, +#include + +#include "aes.h" + + +void PQCLEAN_MCELIECE8192128_AVX_aes256ctr( + uint8_t *out, + size_t outlen, + const uint8_t nonce[AESCTR_NONCEBYTES], + const uint8_t key[AES256_KEYBYTES] +); + +#endif diff --git a/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/api.h b/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/api.h new file mode 100644 index 0000000000..1e5cc1c161 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/api.h @@ -0,0 +1,32 @@ +#ifndef PQCLEAN_MCELIECE8192128_AVX_API_H +#define PQCLEAN_MCELIECE8192128_AVX_API_H + +#include + +#define PQCLEAN_MCELIECE8192128_AVX_CRYPTO_ALGNAME "Classic McEliece 8192128" +#define PQCLEAN_MCELIECE8192128_AVX_CRYPTO_PUBLICKEYBYTES 1357824 +#define PQCLEAN_MCELIECE8192128_AVX_CRYPTO_SECRETKEYBYTES 14080 +#define PQCLEAN_MCELIECE8192128_AVX_CRYPTO_CIPHERTEXTBYTES 240 +#define PQCLEAN_MCELIECE8192128_AVX_CRYPTO_BYTES 32 + + +int PQCLEAN_MCELIECE8192128_AVX_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const uint8_t *pk +); + +int PQCLEAN_MCELIECE8192128_AVX_crypto_kem_dec( + uint8_t *key, + const 
uint8_t *c, + const uint8_t *sk +); + +int PQCLEAN_MCELIECE8192128_AVX_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/benes.c b/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/benes.c new file mode 100644 index 0000000000..11018b420e --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/benes.c @@ -0,0 +1,311 @@ +/* + This file is for Benes network related functions +*/ + +#include "benes.h" + +#include "params.h" +#include "transpose.h" +#include "util.h" + +static void layer_x(vec128 *data, vec128 *bits) { + int i; + vec128 v0, v1; + vec128 d; + + for (i = 0; i < 64; i += 2) { + v0 = PQCLEAN_MCELIECE8192128_AVX_vec128_unpack_low(data[i + 0], data[i + 1]); + v1 = PQCLEAN_MCELIECE8192128_AVX_vec128_unpack_high(data[i + 0], data[i + 1]); + + d = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(v0, v1); + d = PQCLEAN_MCELIECE8192128_AVX_vec128_and(d, *bits++); + v0 = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(v0, d); + v1 = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(v1, d); + + data[i + 0] = PQCLEAN_MCELIECE8192128_AVX_vec128_unpack_low(v0, v1); + data[i + 1] = PQCLEAN_MCELIECE8192128_AVX_vec128_unpack_high(v0, v1); + } +} + +static void layer_0(vec128 *bs, vec128 *cond) { + int x; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 2) { + diff = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ x ], bs[ x + 1 ]); + diff = PQCLEAN_MCELIECE8192128_AVX_vec128_and(diff, *cond++); + bs[ x ] = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ x ], diff); + bs[ x + 1 ] = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ x + 1 ], diff); + } +} + +static void layer_1(vec128 *bs, vec128 *cond) { + int x; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 4) { + diff = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ x + 0 ], bs[ x + 2 ]); + diff = PQCLEAN_MCELIECE8192128_AVX_vec128_and(diff, cond[0]); + bs[ x + 0 ] = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ x + 0 ], diff); + bs[ x + 2 ] = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ x + 2 ], diff); + + diff = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ x + 1 ], bs[ x + 3 ]); + diff = PQCLEAN_MCELIECE8192128_AVX_vec128_and(diff, cond[1]); + bs[ x + 1 ] = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ x + 1 ], diff); + bs[ x + 3 ] = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ x + 3 ], diff); + + cond += 2; + } +} + +static void layer_2(vec128 *bs, vec128 *cond) { + int x; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 8) { + diff = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ x + 0 ], bs[ x + 4 ]); + diff = PQCLEAN_MCELIECE8192128_AVX_vec128_and(diff, cond[0]); + bs[ x + 0 ] = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ x + 0 ], diff); + bs[ x + 4 ] = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ x + 4 ], diff); + + diff = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ x + 1 ], bs[ x + 5 ]); + diff = PQCLEAN_MCELIECE8192128_AVX_vec128_and(diff, cond[1]); + bs[ x + 1 ] = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ x + 1 ], diff); + bs[ x + 5 ] = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ x + 5 ], diff); + + diff = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ x + 2 ], bs[ x + 6 ]); + diff = PQCLEAN_MCELIECE8192128_AVX_vec128_and(diff, cond[2]); + bs[ x + 2 ] = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ x + 2 ], diff); + bs[ x + 6 ] = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ x + 6 ], diff); + + diff = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ x + 3 ], bs[ x + 7 ]); + diff = PQCLEAN_MCELIECE8192128_AVX_vec128_and(diff, cond[3]); + bs[ x + 3 ] = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ x + 3 ], diff); + bs[ 
x + 7 ] = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ x + 7 ], diff); + + cond += 4; + } +} + +static void layer_3(vec128 *bs, vec128 *cond) { + int x, s; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 16) { + for (s = x; s < x + 8; s += 4) { + diff = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ s + 0 ], bs[ s + 8 ]); + diff = PQCLEAN_MCELIECE8192128_AVX_vec128_and(diff, cond[0]); + bs[ s + 0 ] = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ s + 0 ], diff); + bs[ s + 8 ] = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ s + 8 ], diff); + + diff = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ s + 1 ], bs[ s + 9 ]); + diff = PQCLEAN_MCELIECE8192128_AVX_vec128_and(diff, cond[1]); + bs[ s + 1 ] = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ s + 1 ], diff); + bs[ s + 9 ] = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ s + 9 ], diff); + + diff = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ s + 2 ], bs[ s + 10 ]); + diff = PQCLEAN_MCELIECE8192128_AVX_vec128_and(diff, cond[2]); + bs[ s + 2 ] = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ s + 2 ], diff); + bs[ s + 10 ] = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ s + 10 ], diff); + + diff = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ s + 3 ], bs[ s + 11 ]); + diff = PQCLEAN_MCELIECE8192128_AVX_vec128_and(diff, cond[3]); + bs[ s + 3 ] = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ s + 3 ], diff); + bs[ s + 11 ] = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ s + 11 ], diff); + + cond += 4; + } + } +} + +static void layer_4(vec128 *bs, vec128 *cond) { + int x, s; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 32) { + for (s = x; s < x + 16; s += 4) { + diff = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ s + 0 ], bs[ s + 16 ]); + diff = PQCLEAN_MCELIECE8192128_AVX_vec128_and(diff, cond[0]); + bs[ s + 0 ] = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ s + 0 ], diff); + bs[ s + 16 ] = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ s + 16 ], diff); + + diff = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ s + 1 ], bs[ s + 17 ]); + diff = PQCLEAN_MCELIECE8192128_AVX_vec128_and(diff, cond[1]); + bs[ s + 1 ] = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ s + 1 ], diff); + bs[ s + 17 ] = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ s + 17 ], diff); + + diff = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ s + 2 ], bs[ s + 18 ]); + diff = PQCLEAN_MCELIECE8192128_AVX_vec128_and(diff, cond[2]); + bs[ s + 2 ] = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ s + 2 ], diff); + bs[ s + 18 ] = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ s + 18 ], diff); + + diff = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ s + 3 ], bs[ s + 19 ]); + diff = PQCLEAN_MCELIECE8192128_AVX_vec128_and(diff, cond[3]); + bs[ s + 3 ] = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ s + 3 ], diff); + bs[ s + 19 ] = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ s + 19 ], diff); + + cond += 4; + } + } +} + +static void layer_5(vec128 *bs, vec128 *cond) { + int x, s; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 64) { + for (s = x; s < x + 32; s += 4) { + diff = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ s + 0 ], bs[ s + 32 ]); + diff = PQCLEAN_MCELIECE8192128_AVX_vec128_and(diff, cond[0]); + bs[ s + 0 ] = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ s + 0 ], diff); + bs[ s + 32 ] = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ s + 32 ], diff); + + diff = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ s + 1 ], bs[ s + 33 ]); + diff = PQCLEAN_MCELIECE8192128_AVX_vec128_and(diff, cond[1]); + bs[ s + 1 ] = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ s + 1 ], diff); + bs[ s + 33 ] = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ s + 33 ], diff); + + diff = 
PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ s + 2 ], bs[ s + 34 ]); + diff = PQCLEAN_MCELIECE8192128_AVX_vec128_and(diff, cond[2]); + bs[ s + 2 ] = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ s + 2 ], diff); + bs[ s + 34 ] = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ s + 34 ], diff); + + diff = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ s + 3 ], bs[ s + 35 ]); + diff = PQCLEAN_MCELIECE8192128_AVX_vec128_and(diff, cond[3]); + bs[ s + 3 ] = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ s + 3 ], diff); + bs[ s + 35 ] = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(bs[ s + 35 ], diff); + + cond += 4; + } + } +} + +/* input: bits, control bits as array of bytes */ +/* output: bits_int, control bits as array of 128-bit vectors */ +void PQCLEAN_MCELIECE8192128_AVX_load_bits(vec128 bits_int[][32], const unsigned char *bits) { + int i, j; + const unsigned char *ptr = bits; + + vec128 buf[64]; + + for (i = 0; i <= 5; i += 2) { + for (j = 0; j < 64; j++) { + buf[j] = PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(PQCLEAN_MCELIECE8192128_AVX_load8(ptr), PQCLEAN_MCELIECE8192128_AVX_load8(ptr + 512)); + ptr += 8; + } + + PQCLEAN_MCELIECE8192128_AVX_transpose_64x128_sp( buf ); + + for (j = 0; j < 32; j++) { + bits_int[i + 0][j] = PQCLEAN_MCELIECE8192128_AVX_vec128_unpack_low(buf[j], buf[j + 32]); + bits_int[i + 1][j] = PQCLEAN_MCELIECE8192128_AVX_vec128_unpack_high(buf[j], buf[j + 32]); + } + + ptr += 512; + } + + for (i = 6; i <= 18; i++) { + for (j = 0; j < 32; j++) { + bits_int[i][j] = PQCLEAN_MCELIECE8192128_AVX_load16(ptr); + ptr += 16; + } + } + + for (i = 19; i < 25; i += 2) { + for (j = 0; j < 64; j++) { + buf[j] = PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(PQCLEAN_MCELIECE8192128_AVX_load8(ptr), PQCLEAN_MCELIECE8192128_AVX_load8(ptr + 512)); + ptr += 8; + } + + PQCLEAN_MCELIECE8192128_AVX_transpose_64x128_sp( buf ); + + for (j = 0; j < 32; j++) { + bits_int[i + 0][j] = PQCLEAN_MCELIECE8192128_AVX_vec128_unpack_low(buf[j], buf[j + 32]); + bits_int[i + 1][j] = PQCLEAN_MCELIECE8192128_AVX_vec128_unpack_high(buf[j], buf[j + 32]); + } + + ptr += 512; + } +} + +/* input: r, sequence of bits to be permuted */ +/* b, control bits as array of 128-bit vectors */ +/* rev, 0 for normal application; !0 for inverse */ +/* output: r, permuted bits */ +void PQCLEAN_MCELIECE8192128_AVX_benes(vec128 *r, vec128 b[][32], int rev) { + int inc; + + vec128 *b_ptr; + + if (rev == 0) { + inc = 32; + b_ptr = b[ 0]; + } else { + inc = -32; + b_ptr = b[24]; + } + + // + + PQCLEAN_MCELIECE8192128_AVX_transpose_64x128_sp( r ); + + layer_0(r, b_ptr); + b_ptr += inc; + layer_1(r, b_ptr); + b_ptr += inc; + layer_2(r, b_ptr); + b_ptr += inc; + layer_3(r, b_ptr); + b_ptr += inc; + layer_4(r, b_ptr); + b_ptr += inc; + layer_5(r, b_ptr); + b_ptr += inc; + + PQCLEAN_MCELIECE8192128_AVX_transpose_64x128_sp( r ); + + layer_x(r, b_ptr); + b_ptr += inc; + layer_0(r, b_ptr); + b_ptr += inc; + layer_1(r, b_ptr); + b_ptr += inc; + layer_2(r, b_ptr); + b_ptr += inc; + layer_3(r, b_ptr); + b_ptr += inc; + layer_4(r, b_ptr); + b_ptr += inc; + layer_5(r, b_ptr); + b_ptr += inc; + layer_4(r, b_ptr); + b_ptr += inc; + layer_3(r, b_ptr); + b_ptr += inc; + layer_2(r, b_ptr); + b_ptr += inc; + layer_1(r, b_ptr); + b_ptr += inc; + layer_0(r, b_ptr); + b_ptr += inc; + layer_x(r, b_ptr); + b_ptr += inc; + + PQCLEAN_MCELIECE8192128_AVX_transpose_64x128_sp( r ); + + layer_5(r, b_ptr); + b_ptr += inc; + layer_4(r, b_ptr); + b_ptr += inc; + layer_3(r, b_ptr); + b_ptr += inc; + layer_2(r, b_ptr); + b_ptr += inc; + layer_1(r, b_ptr); + b_ptr += inc; + layer_0(r, b_ptr); + 
//b_ptr += inc; + + PQCLEAN_MCELIECE8192128_AVX_transpose_64x128_sp( r ); +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/benes.h b/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/benes.h new file mode 100644 index 0000000000..0409a46cbc --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/benes.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE8192128_AVX_BENES_H +#define PQCLEAN_MCELIECE8192128_AVX_BENES_H +/* + This file is for Benes network related functions +*/ + + +#include "vec128.h" + +void PQCLEAN_MCELIECE8192128_AVX_load_bits(vec128 /*bits_int*/[][32], const unsigned char * /*bits*/); +void PQCLEAN_MCELIECE8192128_AVX_benes(vec128 * /*r*/, vec128 /*b*/[][32], int /*rev*/); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/bm.c b/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/bm.c new file mode 100644 index 0000000000..30172e3de1 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/bm.c @@ -0,0 +1,219 @@ +/* + This file is for the inversion-free Berlekamp-Massey algorithm + see https://ieeexplore.ieee.org/document/87857 +*/ + +#include "bm.h" + +#include "gf.h" +#include "params.h" +#include "vec128.h" + +#include + +extern gf PQCLEAN_MCELIECE8192128_AVX_vec_reduce_asm(vec128 *); +extern void PQCLEAN_MCELIECE8192128_AVX_update_asm(void *, gf, int); + +static inline uint16_t mask_nonzero(gf a) { + uint32_t ret = a; + + ret -= 1; + ret >>= 31; + ret -= 1; + + return ret; +} + +static inline uint16_t mask_leq(uint16_t a, uint16_t b) { + uint32_t a_tmp = a; + uint32_t b_tmp = b; + uint32_t ret = b_tmp - a_tmp; + + ret >>= 31; + ret -= 1; + + return ret; +} + +static inline void vec128_cmov(vec128 out[][2], uint16_t mask) { + int i; + + vec128 v0, v1; + + vec128 m0 = PQCLEAN_MCELIECE8192128_AVX_vec128_set1_16b( mask); + vec128 m1 = PQCLEAN_MCELIECE8192128_AVX_vec128_set1_16b(~mask); + + for (i = 0; i < GFBITS; i++) { + v0 = PQCLEAN_MCELIECE8192128_AVX_vec128_and(out[i][1], m0); + v1 = PQCLEAN_MCELIECE8192128_AVX_vec128_and(out[i][0], m1); + out[i][0] = PQCLEAN_MCELIECE8192128_AVX_vec128_or(v0, v1); + } +} + +static inline void interleave(vec256 *in, int idx0, int idx1, vec256 *mask, int b) { + int s = 1 << b; + + vec256 x, y; + + x = PQCLEAN_MCELIECE8192128_AVX_vec256_or(PQCLEAN_MCELIECE8192128_AVX_vec256_and(in[idx0], mask[0]), + PQCLEAN_MCELIECE8192128_AVX_vec256_sll_4x(PQCLEAN_MCELIECE8192128_AVX_vec256_and(in[idx1], mask[0]), s)); + + y = PQCLEAN_MCELIECE8192128_AVX_vec256_or(PQCLEAN_MCELIECE8192128_AVX_vec256_srl_4x(PQCLEAN_MCELIECE8192128_AVX_vec256_and(in[idx0], mask[1]), s), + PQCLEAN_MCELIECE8192128_AVX_vec256_and(in[idx1], mask[1])); + + in[idx0] = x; + in[idx1] = y; +} + +/* input: in, field elements in bitsliced form */ +/* output: out, field elements in non-bitsliced form */ +static inline void get_coefs(gf *out, vec256 *in) { + int i, k; + + vec256 mask[4][2]; + vec256 buf[16]; + + for (i = 0; i < 13; i++) { + buf[i] = in[i]; + } + for (i = 13; i < 16; i++) { + buf[i] = PQCLEAN_MCELIECE8192128_AVX_vec256_setzero(); + } + + mask[0][0] = PQCLEAN_MCELIECE8192128_AVX_vec256_set1_16b(0x5555); + mask[0][1] = PQCLEAN_MCELIECE8192128_AVX_vec256_set1_16b(0xAAAA); + mask[1][0] = PQCLEAN_MCELIECE8192128_AVX_vec256_set1_16b(0x3333); + mask[1][1] = PQCLEAN_MCELIECE8192128_AVX_vec256_set1_16b(0xCCCC); + mask[2][0] = PQCLEAN_MCELIECE8192128_AVX_vec256_set1_16b(0x0F0F); + mask[2][1] = PQCLEAN_MCELIECE8192128_AVX_vec256_set1_16b(0xF0F0); + mask[3][0] = 
PQCLEAN_MCELIECE8192128_AVX_vec256_set1_16b(0x00FF); + mask[3][1] = PQCLEAN_MCELIECE8192128_AVX_vec256_set1_16b(0xFF00); + + interleave(buf, 0, 8, mask[3], 3); + interleave(buf, 1, 9, mask[3], 3); + interleave(buf, 2, 10, mask[3], 3); + interleave(buf, 3, 11, mask[3], 3); + interleave(buf, 4, 12, mask[3], 3); + interleave(buf, 5, 13, mask[3], 3); + interleave(buf, 6, 14, mask[3], 3); + interleave(buf, 7, 15, mask[3], 3); + + interleave(buf, 0, 4, mask[2], 2); + interleave(buf, 1, 5, mask[2], 2); + interleave(buf, 2, 6, mask[2], 2); + interleave(buf, 3, 7, mask[2], 2); + interleave(buf, 8, 12, mask[2], 2); + interleave(buf, 9, 13, mask[2], 2); + interleave(buf, 10, 14, mask[2], 2); + interleave(buf, 11, 15, mask[2], 2); + + interleave(buf, 0, 2, mask[1], 1); + interleave(buf, 1, 3, mask[1], 1); + interleave(buf, 4, 6, mask[1], 1); + interleave(buf, 5, 7, mask[1], 1); + interleave(buf, 8, 10, mask[1], 1); + interleave(buf, 9, 11, mask[1], 1); + interleave(buf, 12, 14, mask[1], 1); + interleave(buf, 13, 15, mask[1], 1); + + interleave(buf, 0, 1, mask[0], 0); + interleave(buf, 2, 3, mask[0], 0); + interleave(buf, 4, 5, mask[0], 0); + interleave(buf, 6, 7, mask[0], 0); + interleave(buf, 8, 9, mask[0], 0); + interleave(buf, 10, 11, mask[0], 0); + interleave(buf, 12, 13, mask[0], 0); + interleave(buf, 14, 15, mask[0], 0); + + for (i = 0; i < 16; i++) { + for (k = 0; k < 4; k++) { + out[ (4 * 0 + k) * 16 + i ] = (PQCLEAN_MCELIECE8192128_AVX_vec256_extract(buf[i], 0) >> (k * 16)) & GFMASK; + out[ (4 * 1 + k) * 16 + i ] = (PQCLEAN_MCELIECE8192128_AVX_vec256_extract(buf[i], 1) >> (k * 16)) & GFMASK; + out[ (4 * 2 + k) * 16 + i ] = (PQCLEAN_MCELIECE8192128_AVX_vec256_extract(buf[i], 2) >> (k * 16)) & GFMASK; + out[ (4 * 3 + k) * 16 + i ] = (PQCLEAN_MCELIECE8192128_AVX_vec256_extract(buf[i], 3) >> (k * 16)) & GFMASK; + } + } +} + +typedef union { + vec128 as_128[GFBITS][2]; + vec256 as_256[GFBITS]; +} aligned_double_vec128; + +/* input: in, sequence of field elements */ +/* output: out, minimal polynomial of in */ +void PQCLEAN_MCELIECE8192128_AVX_bm(vec128 *out, vec256 *in) { + int i; + uint16_t N, L; + uint16_t mask; + uint64_t one = 1, t; + + vec128 prod[ GFBITS ]; + vec128 interval[GFBITS]; + + aligned_double_vec128 db; + aligned_double_vec128 BC_tmp; + aligned_double_vec128 BC; + + gf d, b, c0 = 1; + gf coefs[256]; + + // initialization + + get_coefs(coefs, in); + + BC.as_128[0][0] = PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0, one << 63); + BC.as_128[0][1] = PQCLEAN_MCELIECE8192128_AVX_vec128_setzero(); + + for (i = 1; i < GFBITS; i++) { + BC.as_128[i][0] = BC.as_128[i][1] = PQCLEAN_MCELIECE8192128_AVX_vec128_setzero(); + } + + b = 1; + L = 0; + + // + + for (i = 0; i < GFBITS; i++) { + interval[i] = PQCLEAN_MCELIECE8192128_AVX_vec128_setzero(); + } + + for (N = 0; N < 256; N++) { + PQCLEAN_MCELIECE8192128_AVX_vec128_mul_asm(prod, interval, BC.as_128[0] + 1, 32); + PQCLEAN_MCELIECE8192128_AVX_update_asm(interval, coefs[N], 16); + + d = PQCLEAN_MCELIECE8192128_AVX_vec_reduce_asm(prod); + + t = PQCLEAN_MCELIECE8192128_AVX_gf_mul2(c0, coefs[N], b); + d ^= t & 0xFFFFFFFF; + + mask = mask_nonzero(d) & mask_leq(L * 2, N); + + for (i = 0; i < GFBITS; i++) { + db.as_128[i][0] = PQCLEAN_MCELIECE8192128_AVX_vec128_setbits((d >> i) & 1); + db.as_128[i][1] = PQCLEAN_MCELIECE8192128_AVX_vec128_setbits((b >> i) & 1); + } + + PQCLEAN_MCELIECE8192128_AVX_vec256_mul(BC_tmp.as_256, db.as_256, BC.as_256); + + vec128_cmov(BC.as_128, mask); + PQCLEAN_MCELIECE8192128_AVX_update_asm(BC.as_128, c0 & mask, 32); + + for (i 
= 0; i < GFBITS; i++) { + BC.as_128[i][1] = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(BC_tmp.as_128[i][0], BC_tmp.as_128[i][1]); + } + + c0 = t >> 32; + b = (d & mask) | (b & ~mask); + L = ((N + 1 - L) & mask) | (L & ~mask); + } + + c0 = PQCLEAN_MCELIECE8192128_AVX_gf_inv(c0); + + for (i = 0; i < GFBITS; i++) { + prod[i] = PQCLEAN_MCELIECE8192128_AVX_vec128_setbits((c0 >> i) & 1); + } + + PQCLEAN_MCELIECE8192128_AVX_vec128_mul_asm(out, prod, BC.as_128[0] + 1, 32); +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/bm.h b/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/bm.h new file mode 100644 index 0000000000..f1ca018bbf --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/bm.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE8192128_AVX_BM_H +#define PQCLEAN_MCELIECE8192128_AVX_BM_H +/* + This file is for the inversion-free Berlekamp-Massey algorithm + see https://ieeexplore.ieee.org/document/87857 +*/ + +#include "vec128.h" +#include "vec256.h" + +void PQCLEAN_MCELIECE8192128_AVX_bm(vec128 * /*out*/, vec256 * /*in*/); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/consts.S b/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/consts.S new file mode 100644 index 0000000000..e34172eab8 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/consts.S @@ -0,0 +1,33 @@ +.data + +# not supported on MacOS +#.section .rodata + +.globl PQCLEAN_MCELIECE8192128_AVX_MASK0_0 +.globl PQCLEAN_MCELIECE8192128_AVX_MASK0_1 +.globl PQCLEAN_MCELIECE8192128_AVX_MASK1_0 +.globl PQCLEAN_MCELIECE8192128_AVX_MASK1_1 +.globl PQCLEAN_MCELIECE8192128_AVX_MASK2_0 +.globl PQCLEAN_MCELIECE8192128_AVX_MASK2_1 +.globl PQCLEAN_MCELIECE8192128_AVX_MASK3_0 +.globl PQCLEAN_MCELIECE8192128_AVX_MASK3_1 +.globl PQCLEAN_MCELIECE8192128_AVX_MASK4_0 +.globl PQCLEAN_MCELIECE8192128_AVX_MASK4_1 +.globl PQCLEAN_MCELIECE8192128_AVX_MASK5_0 +.globl PQCLEAN_MCELIECE8192128_AVX_MASK5_1 + +.p2align 5 + +PQCLEAN_MCELIECE8192128_AVX_MASK0_0: .quad 0x5555555555555555, 0x5555555555555555, 0x5555555555555555, 0x5555555555555555 +PQCLEAN_MCELIECE8192128_AVX_MASK0_1: .quad 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA +PQCLEAN_MCELIECE8192128_AVX_MASK1_0: .quad 0x3333333333333333, 0x3333333333333333, 0x3333333333333333, 0x3333333333333333 +PQCLEAN_MCELIECE8192128_AVX_MASK1_1: .quad 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC +PQCLEAN_MCELIECE8192128_AVX_MASK2_0: .quad 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F +PQCLEAN_MCELIECE8192128_AVX_MASK2_1: .quad 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0 +PQCLEAN_MCELIECE8192128_AVX_MASK3_0: .quad 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF +PQCLEAN_MCELIECE8192128_AVX_MASK3_1: .quad 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00 +PQCLEAN_MCELIECE8192128_AVX_MASK4_0: .quad 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF +PQCLEAN_MCELIECE8192128_AVX_MASK4_1: .quad 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000 +PQCLEAN_MCELIECE8192128_AVX_MASK5_0: .quad 0x00000000FFFFFFFF, 0x00000000FFFFFFFF, 0x00000000FFFFFFFF, 0x00000000FFFFFFFF +PQCLEAN_MCELIECE8192128_AVX_MASK5_1: .quad 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000 + diff --git a/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/consts.inc 
b/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/consts.inc new file mode 100644 index 0000000000..cc354957d1 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/consts.inc @@ -0,0 +1,502 @@ +//64 +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X6969969669699696, 0X6969969669699696, 0X6969969669699696, 0X6969969669699696), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X9966669966999966, 0X9966669966999966, 0X9966669966999966, 0X9966669966999966), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X9966669966999966, 0X9966669966999966, 0X9966669966999966, 0X9966669966999966), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0XFF0000FF00FFFF00, 0XFF0000FF00FFFF00, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XCC3333CCCC3333CC, 0XCC3333CCCC3333CC, 0XCC3333CCCC3333CC, 0XCC3333CCCC3333CC), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X9966669966999966, 0X9966669966999966, 0X9966669966999966, 0X9966669966999966), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X6666666666666666, 0X6666666666666666, 0X6666666666666666, 0X6666666666666666), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA55AA55AA55AA55A, 0XA55AA55AA55AA55A, 0XA55AA55AA55AA55A, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XCCCC33333333CCCC, 0XCCCC33333333CCCC, 0XCCCC33333333CCCC, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0FF0F00FF00F0FF0, 0X0FF0F00FF00F0FF0, 0X0FF0F00FF00F0FF0, 0X0FF0F00FF00F0FF0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A) +}, +//128 +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X9999999966666666, 0X6666666699999999, 0X9999999966666666, 0X6666666699999999), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3, 0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XCC33CC3333CC33CC, 0X33CC33CCCC33CC33, 0XCC33CC3333CC33CC, 0X33CC33CCCC33CC33), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X3C3C3C3C3C3C3C3C, 0X3C3C3C3C3C3C3C3C, 0X3C3C3C3C3C3C3C3C, 0X3C3C3C3C3C3C3C3C), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XAA5555AAAA5555AA, 0XAA5555AAAA5555AA, 0XAA5555AAAA5555AA, 0XAA5555AAAA5555AA), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XC33C3CC33CC3C33C, 0XC33C3CC33CC3C33C, 0XC33C3CC33CC3C33C, 0XC33C3CC33CC3C33C), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X00FFFF0000FFFF00, 0XFF0000FFFF0000FF, 0X00FFFF0000FFFF00, 0XFF0000FFFF0000FF) +}, +//256 +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFF00000000, 
0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0FF0F00FF00F0FF0, 0X0FF0F00FF00F0FF0, 0XF00F0FF00FF0F00F, 0XF00F0FF00FF0F00F), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X9669966969966996, 0X6996699696699669, 0X6996699696699669, 0X9669966969966996), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000FFFFFFFF0000, 0XFFFF00000000FFFF, 0X0000FFFFFFFF0000, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X33333333CCCCCCCC, 0X33333333CCCCCCCC, 0X33333333CCCCCCCC, 0X33333333CCCCCCCC), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0X5AA5A55AA55A5AA5, 0X5AA5A55AA55A5AA5, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X00FFFF0000FFFF00, 0XFF0000FFFF0000FF, 0XFF0000FFFF0000FF, 0X00FFFF0000FFFF00), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XC33CC33CC33CC33C, 0XC33CC33CC33CC33C, 0XC33CC33CC33CC33C, 0XC33CC33CC33CC33C), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555) +}, +//512 +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XC33C3CC33CC3C33C, 0X3CC3C33CC33C3CC3, 0XC33C3CC33CC3C33C, 0X3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X9966669966999966, 0X6699996699666699, 0X9966669966999966, 0X6699996699666699), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X9966996699669966, 0X6699669966996699, 0X6699669966996699, 0X9966996699669966), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X6969969669699696, 0X6969969669699696, 0X6969969669699696, 0X6969969669699696), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XAA55AA5555AA55AA, 0XAA55AA5555AA55AA, 0XAA55AA5555AA55AA, 0XAA55AA5555AA55AA), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X9966996699669966, 0X9966996699669966, 0X6699669966996699, 0X6699669966996699), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5AA5A55A5AA5A55A, 0XA55A5AA5A55A5AA5, 0X5AA5A55A5AA5A55A, 0XA55A5AA5A55A5AA5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XC3C3C3C33C3C3C3C, 0XC3C3C3C33C3C3C3C, 0X3C3C3C3CC3C3C3C3, 0X3C3C3C3CC3C3C3C3), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X3333CCCC3333CCCC, 0X3333CCCC3333CCCC, 0XCCCC3333CCCC3333, 0XCCCC3333CCCC3333), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X9999999966666666, 0X6666666699999999, 0X6666666699999999, 0X9999999966666666), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XC33CC33CC33CC33C, 0X3CC33CC33CC33CC3, 0XC33CC33CC33CC33C, 0X3CC33CC33CC33CC3), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X6666999999996666, 0X9999666666669999, 0X9999666666669999, 0X6666999999996666) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XC33C3CC33CC3C33C, 0X3CC3C33CC33C3CC3, 0XC33C3CC33CC3C33C, 0X3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X6699996699666699, 0X9966669966999966, 0X6699996699666699, 0X9966669966999966), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X6699669966996699, 0X9966996699669966, 0X9966996699669966, 0X6699669966996699), + 
PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X6969969669699696, 0X6969969669699696, 0X6969969669699696, 0X6969969669699696), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X55AA55AAAA55AA55, 0X55AA55AAAA55AA55, 0X55AA55AAAA55AA55, 0X55AA55AAAA55AA55), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X9966996699669966, 0X9966996699669966, 0X6699669966996699, 0X6699669966996699), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5AA5A55A5AA5A55A, 0XA55A5AA5A55A5AA5, 0X5AA5A55A5AA5A55A, 0XA55A5AA5A55A5AA5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XC3C3C3C33C3C3C3C, 0XC3C3C3C33C3C3C3C, 0X3C3C3C3CC3C3C3C3, 0X3C3C3C3CC3C3C3C3), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X3333CCCC3333CCCC, 0X3333CCCC3333CCCC, 0XCCCC3333CCCC3333, 0XCCCC3333CCCC3333), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X9999999966666666, 0X6666666699999999, 0X6666666699999999, 0X9999999966666666), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XC33CC33CC33CC33C, 0X3CC33CC33CC33CC3, 0XC33CC33CC33CC33C, 0X3CC33CC33CC33CC3), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X6666999999996666, 0X9999666666669999, 0X9999666666669999, 0X6666999999996666) +}, +//1024 +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3, 0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA, 0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA, 0XAA5555AA55AAAA55, 0XAA5555AA55AAAA55), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF00F0FF0F00F0FF0, 0XF00F0FF0F00F0FF0, 0X0FF0F00F0FF0F00F, 0X0FF0F00F0FF0F00F), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X9669699696696996, 0X9669699696696996, 0X9669699696696996, 0X9669699696696996), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA55AA55AA55AA55A, 0X5AA55AA55AA55AA5, 0X5AA55AA55AA55AA5, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA, 0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XCCCC33333333CCCC, 0X3333CCCCCCCC3333, 0X3333CCCCCCCC3333, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000, 0XFFFF00000000FFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X6996699669966996, 0X9669966996699669, 0X9669966996699669, 0X6996699669966996) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3, 0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA, 0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XAA5555AA55AAAA55, 0XAA5555AA55AAAA55, 0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0FF0F00F0FF0F00F, 
0X0FF0F00F0FF0F00F, 0XF00F0FF0F00F0FF0, 0XF00F0FF0F00F0FF0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X6996966969969669, 0X6996966969969669, 0X6996966969969669, 0X6996966969969669), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA55AA55AA55AA55A, 0X5AA55AA55AA55AA5, 0X5AA55AA55AA55AA5, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555, 0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XCCCC33333333CCCC, 0X3333CCCCCCCC3333, 0X3333CCCCCCCC3333, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000, 0XFFFF00000000FFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X6996699669966996, 0X9669966996699669, 0X9669966996699669, 0X6996699669966996) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3, 0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555, 0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XAA5555AA55AAAA55, 0XAA5555AA55AAAA55, 0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF00F0FF0F00F0FF0, 0XF00F0FF0F00F0FF0, 0X0FF0F00F0FF0F00F, 0X0FF0F00F0FF0F00F), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X9669699696696996, 0X9669699696696996, 0X9669699696696996, 0X9669699696696996), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA55AA55AA55AA55A, 0X5AA55AA55AA55AA5, 0X5AA55AA55AA55AA5, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA, 0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XCCCC33333333CCCC, 0X3333CCCCCCCC3333, 0X3333CCCCCCCC3333, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000, 0XFFFF00000000FFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X6996699669966996, 0X9669966996699669, 0X9669966996699669, 0X6996699669966996) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3, 0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555, 0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA, 0XAA5555AA55AAAA55, 0XAA5555AA55AAAA55), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0FF0F00F0FF0F00F, 0X0FF0F00F0FF0F00F, 0XF00F0FF0F00F0FF0, 0XF00F0FF0F00F0FF0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X6996966969969669, 0X6996966969969669, 0X6996966969969669, 0X6996966969969669), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA55AA55AA55AA55A, 0X5AA55AA55AA55AA5, 0X5AA55AA55AA55AA5, 0XA55AA55AA55AA55A), + 
PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555, 0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XCCCC33333333CCCC, 0X3333CCCCCCCC3333, 0X3333CCCCCCCC3333, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000, 0XFFFF00000000FFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X6996699669966996, 0X9669966996699669, 0X9669966996699669, 0X6996699669966996) +}, +//2048 +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA, 0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F, 0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55, 0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0, 0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X6996966996696996, 
0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55, 0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F, 0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA, 0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0, 0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + 
PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5555555555555555, 0X5555555555555555, 0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA, 0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F, 0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5555555555555555, 0X5555555555555555, 0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55, 0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0, 0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5555555555555555, 0X5555555555555555, 0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 
0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55, 0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F, 0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5555555555555555, 0X5555555555555555, 0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA, 0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0, 0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +//4096 +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + 
PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFF00000000, 
0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + 
PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 
0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + 
PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 
0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + 
PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFF00FF00FF00FF00, 
0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +} diff --git a/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/controlbits.c b/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/controlbits.c new file mode 100644 index 0000000000..ec290b84e8 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/controlbits.c @@ -0,0 +1,274 @@ +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + +#include "controlbits.h" + +#include "params.h" + +#include + +typedef uint8_t bit; + +#define N (1 << GFBITS) + +static bit is_smaller(uint32_t a, uint32_t b) { + uint32_t ret = 0; + + ret = a - b; + ret >>= 31; + + return (bit)ret; +} + +static bit is_smaller_63b(uint64_t a, uint64_t b) { + uint64_t ret = 0; + + ret = a - b; + ret >>= 63; + + return (bit)ret; +} + +static void cswap(uint32_t *x, uint32_t *y, bit swap) { + uint32_t m; + uint32_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +static void cswap_63b(uint64_t *x, uint64_t *y, bit swap) { + uint64_t m; + uint64_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +/* output x = min(input x,input y) */ +/* output y = max(input x,input y) */ + +static void minmax(uint32_t *x, uint32_t *y) { + bit m; + + m = is_smaller(*y, *x); + cswap(x, y, m); +} + +static void minmax_63b(uint64_t *x, uint64_t *y) { + bit m; + + m = is_smaller_63b(*y, *x); + cswap_63b(x, y, m); +} + +/* merge first half of x[0],x[step],...,x[(2*n-1)*step] with second half */ +/* requires n to be a power of 2 */ + +static void merge(int n, uint32_t *x, int step) { + int i; + if (n == 1) { + minmax(&x[0], &x[step]); + } else { + merge(n / 2, x, step * 2); + merge(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax(&x[i * step], &x[(i + 1) * step]); + } + } +} + +static void merge_63b(int n, uint64_t *x, int step) { + int i; + if (n == 1) { + minmax_63b(&x[0], &x[step]); + } else { + merge_63b(n / 2, x, step * 2); + merge_63b(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax_63b(&x[i * step], &x[(i + 1) * step]); + } + } +} + +/* sort x[0],x[1],...,x[n-1] in place */ +/* requires n to be a power of 2 */ + +static void sort(int n, uint32_t *x) { + if (n <= 1) { + return; + } + sort(n / 2, x); + sort(n / 2, x + n / 2); + merge(n / 2, x, 1); +} + +void PQCLEAN_MCELIECE8192128_AVX_sort_63b(int n, uint64_t *x) { + if (n <= 1) { + return; + } + PQCLEAN_MCELIECE8192128_AVX_sort_63b(n / 2, x); + PQCLEAN_MCELIECE8192128_AVX_sort_63b(n / 2, x + n / 2); + merge_63b(n / 2, x, 1); +} + +/* y[pi[i]] = x[i] */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void composeinv(int n, uint32_t *y, const uint32_t *x, const uint32_t *pi) { // NC + int i; + uint32_t t[2 * N]; + + for (i = 0; i < n; ++i) { + t[i] = x[i] | (pi[i] << 16); + } + + sort(n, t); + + for (i = 0; i < n; ++i) { + y[i] = t[i] & 0xFFFF; + } +} + +/* ip[i] = j iff pi[i] = j */ +/* requires n = 2^w */ +/* requires pi to be 
a permutation */ +static void invert(int n, uint32_t *ip, const uint32_t *pi) { + int i; + + for (i = 0; i < n; i++) { + ip[i] = i; + } + + composeinv(n, ip, ip, pi); +} + + +static void flow(int w, uint32_t *x, const uint32_t *y, int t) { + bit m0; + bit m1; + + uint32_t b; + uint32_t y_copy = *y; + + m0 = is_smaller(*y & ((1 << w) - 1), *x & ((1 << w) - 1)); + m1 = is_smaller(0, t); + + cswap(x, &y_copy, m0); + b = m0 & m1; + *x ^= b << w; +} + +/* input: permutation pi */ +/* output: (2w-1)n/2 (or 0 if n==1) control bits c[0],c[step],c[2*step],... */ +/* requires n = 2^w */ +static void controlbitsfrompermutation(int w, int n, int step, int off, unsigned char *c, const uint32_t *pi) { + int i; + int j; + int k; + int t; + uint32_t ip[N] = {0}; + uint32_t I[2 * N] = {0}; + uint32_t P[2 * N] = {0}; + uint32_t PI[2 * N] = {0}; + uint32_t T[2 * N] = {0}; + uint32_t piflip[N] = {0}; + uint32_t subpi[2][N / 2] = {{0}}; + + if (w == 1) { + c[ off / 8 ] |= (pi[0] & 1) << (off % 8); + } + if (w <= 1) { + return; + } + + invert(n, ip, pi); + + for (i = 0; i < n; ++i) { + I[i] = ip[i] | (1 << w); + I[n + i] = pi[i]; + } + + for (i = 0; i < 2 * n; ++i) { + P[i] = (i >> w) + (i & ((1 << w) - 2)) + ((i & 1) << w); + } + + for (t = 0; t < w; ++t) { + composeinv(2 * n, PI, P, I); + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &PI[i], t); + } + + for (i = 0; i < 2 * n; ++i) { + T[i] = I[i ^ 1]; + } + + composeinv(2 * n, I, I, T); + + for (i = 0; i < 2 * n; ++i) { + T[i] = P[i ^ 1]; + } + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &T[i], 1); + } + } + + for (i = 0; i < n; ++i) { + for (j = 0; j < w; ++j) { + piflip[i] = pi[i]; + } + } + + for (i = 0; i < n / 2; ++i) { + c[ (off + i * step) / 8 ] |= ((P[i * 2] >> w) & 1) << ((off + i * step) % 8); + } + for (i = 0; i < n / 2; ++i) { + c[ (off + ((w - 1)*n + i) * step) / 8 ] |= ((P[n + i * 2] >> w) & 1) << ((off + ((w - 1) * n + i) * step) % 8); + } + + for (i = 0; i < n / 2; ++i) { + cswap(&piflip[i * 2], &piflip[i * 2 + 1], (P[n + i * 2] >> w) & 1); + } + + for (k = 0; k < 2; ++k) { + for (i = 0; i < n / 2; ++i) { + subpi[k][i] = piflip[i * 2 + k] >> 1; + } + } + + for (k = 0; k < 2; ++k) { + controlbitsfrompermutation(w - 1, n / 2, step * 2, off + step * (n / 2 + k), c, subpi[k]); + } +} + +/* input: pi, a permutation*/ +/* output: out, control bits w.r.t. pi */ +void PQCLEAN_MCELIECE8192128_AVX_controlbits(unsigned char *out, const uint32_t *pi) { + unsigned int i; + unsigned char c[ (2 * GFBITS - 1) * (1 << GFBITS) / 16 ]; + + for (i = 0; i < sizeof(c); i++) { + c[i] = 0; + } + + controlbitsfrompermutation(GFBITS, (1 << GFBITS), 1, 0, c, pi); + + for (i = 0; i < sizeof(c); i++) { + out[i] = c[i]; + } +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/controlbits.h b/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/controlbits.h new file mode 100644 index 0000000000..43a8a9ea5d --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/controlbits.h @@ -0,0 +1,15 @@ +#ifndef PQCLEAN_MCELIECE8192128_AVX_CONTROLBITS_H +#define PQCLEAN_MCELIECE8192128_AVX_CONTROLBITS_H +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. 
a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + + +#include + +void PQCLEAN_MCELIECE8192128_AVX_sort_63b(int n, uint64_t *x); +void PQCLEAN_MCELIECE8192128_AVX_controlbits(unsigned char *out, const uint32_t *pi); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/crypto_hash.h b/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/crypto_hash.h new file mode 100644 index 0000000000..23d1c16036 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/crypto_hash.h @@ -0,0 +1,7 @@ +#ifndef PQCLEAN_MCELIECE8192128_AVX_CRYPTO_HASH_H +#define PQCLEAN_MCELIECE8192128_AVX_CRYPTO_HASH_H +#include "fips202.h" + +#define crypto_hash_32b(out,in,inlen) shake256(out, 32, in, inlen) + +#endif diff --git a/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/decrypt.c b/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/decrypt.c new file mode 100644 index 0000000000..1fc3dcc465 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/decrypt.c @@ -0,0 +1,207 @@ +/* + This file is for Niederreiter decryption +*/ + +#include "decrypt.h" + +#include "benes.h" +#include "bm.h" +#include "fft.h" +#include "fft_tr.h" +#include "params.h" +#include "util.h" + +#include + +static void scaling(vec256 out[][GFBITS], vec256 inv[][GFBITS], const unsigned char *sk, vec256 *recv) { + int i, j; + + vec128 sk_int[ GFBITS ]; + vec256 eval[32][ GFBITS ]; + vec256 tmp[ GFBITS ]; + + // computing inverses + + PQCLEAN_MCELIECE8192128_AVX_irr_load(sk_int, sk); + + PQCLEAN_MCELIECE8192128_AVX_fft(eval, sk_int); + + for (i = 0; i < 32; i++) { + PQCLEAN_MCELIECE8192128_AVX_vec256_sq(eval[i], eval[i]); + } + + PQCLEAN_MCELIECE8192128_AVX_vec256_copy(inv[0], eval[0]); + + for (i = 1; i < 32; i++) { + PQCLEAN_MCELIECE8192128_AVX_vec256_mul(inv[i], inv[i - 1], eval[i]); + } + + PQCLEAN_MCELIECE8192128_AVX_vec256_inv(tmp, inv[31]); + + for (i = 30; i >= 0; i--) { + PQCLEAN_MCELIECE8192128_AVX_vec256_mul(inv[i + 1], tmp, inv[i]); + PQCLEAN_MCELIECE8192128_AVX_vec256_mul(tmp, tmp, eval[i + 1]); + } + + PQCLEAN_MCELIECE8192128_AVX_vec256_copy(inv[0], tmp); + + // + + for (i = 0; i < 32; i++) { + for (j = 0; j < GFBITS; j++) { + out[i][j] = PQCLEAN_MCELIECE8192128_AVX_vec256_and(inv[i][j], recv[i]); + } + } +} + +static void scaling_inv(vec256 out[][GFBITS], vec256 inv[][GFBITS], vec256 *recv) { + int i, j; + + for (i = 0; i < 32; i++) { + for (j = 0; j < GFBITS; j++) { + out[i][j] = PQCLEAN_MCELIECE8192128_AVX_vec256_and(inv[i][j], recv[i]); + } + } +} + +static void preprocess(vec128 *recv, const unsigned char *s) { + int i; + + recv[0] = PQCLEAN_MCELIECE8192128_AVX_vec128_setbits(0); + + for (i = 1; i < 64; i++) { + recv[i] = recv[0]; + } + + for (i = 0; i < SYND_BYTES / 16; i++) { + recv[i] = PQCLEAN_MCELIECE8192128_AVX_load16(s + i * 16); + } +} + +static int weight(vec256 *v) { + int i, w = 0; + + for (i = 0; i < 32; i++) { + w += (int)_mm_popcnt_u64( PQCLEAN_MCELIECE8192128_AVX_vec256_extract(v[i], 0) ); + w += (int)_mm_popcnt_u64( PQCLEAN_MCELIECE8192128_AVX_vec256_extract(v[i], 1) ); + w += (int)_mm_popcnt_u64( PQCLEAN_MCELIECE8192128_AVX_vec256_extract(v[i], 2) ); + w += (int)_mm_popcnt_u64( PQCLEAN_MCELIECE8192128_AVX_vec256_extract(v[i], 3) ); + } + + return w; +} + +static uint16_t synd_cmp(vec256 *s0, vec256 *s1) { + int i; + vec256 diff; + + diff = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(s0[0], s1[0]); + + for (i = 1; i < GFBITS; i++) { + diff = 
PQCLEAN_MCELIECE8192128_AVX_vec256_or(diff, PQCLEAN_MCELIECE8192128_AVX_vec256_xor(s0[i], s1[i])); + } + + return PQCLEAN_MCELIECE8192128_AVX_vec256_testz(diff); +} + +static void reformat_128to256(vec256 *out, vec128 *in) { + int i; + uint64_t v[4]; + + for (i = 0; i < 32; i++) { + v[0] = PQCLEAN_MCELIECE8192128_AVX_vec128_extract(in[2 * i + 0], 0); + v[1] = PQCLEAN_MCELIECE8192128_AVX_vec128_extract(in[2 * i + 0], 1); + v[2] = PQCLEAN_MCELIECE8192128_AVX_vec128_extract(in[2 * i + 1], 0); + v[3] = PQCLEAN_MCELIECE8192128_AVX_vec128_extract(in[2 * i + 1], 1); + + out[i] = PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(v[0], v[1], v[2], v[3]); + } +} + +static void reformat_256to128(vec128 *out, vec256 *in) { + int i; + uint64_t v[4]; + + for (i = 0; i < 32; i++) { + v[0] = PQCLEAN_MCELIECE8192128_AVX_vec256_extract(in[i], 0); + v[1] = PQCLEAN_MCELIECE8192128_AVX_vec256_extract(in[i], 1); + v[2] = PQCLEAN_MCELIECE8192128_AVX_vec256_extract(in[i], 2); + v[3] = PQCLEAN_MCELIECE8192128_AVX_vec256_extract(in[i], 3); + + out[2 * i + 0] = PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(v[0], v[1]); + out[2 * i + 1] = PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(v[2], v[3]); + } +} + +/* Niederreiter decryption with the Berlekamp decoder */ +/* intput: sk, secret key */ +/* c, ciphertext (syndrome) */ +/* output: e, error vector */ +/* return: 0 for success; 1 for failure */ +int PQCLEAN_MCELIECE8192128_AVX_decrypt(unsigned char *e, const unsigned char *sk, const unsigned char *c) { + int i; + + uint16_t check_synd; + uint16_t check_weight; + + vec256 inv[ 64 ][ GFBITS ]; + vec256 scaled[ 64 ][ GFBITS ]; + vec256 eval[ 64 ][ GFBITS ]; + + vec128 error128[ 64 ]; + vec256 error256[ 32 ]; + + vec256 s_priv[ GFBITS ]; + vec256 s_priv_cmp[ GFBITS ]; + vec128 locator[ GFBITS ]; + + vec128 recv128[ 64 ]; + vec256 recv256[ 32 ]; + vec256 allone; + + vec128 bits_int[25][32]; + + // Berlekamp decoder + + preprocess(recv128, c); + + PQCLEAN_MCELIECE8192128_AVX_load_bits(bits_int, sk + IRR_BYTES); + PQCLEAN_MCELIECE8192128_AVX_benes(recv128, bits_int, 1); + reformat_128to256(recv256, recv128); + + scaling(scaled, inv, sk, recv256); // scaling + PQCLEAN_MCELIECE8192128_AVX_fft_tr(s_priv, scaled); // transposed FFT + PQCLEAN_MCELIECE8192128_AVX_bm(locator, s_priv); // Berlekamp Massey + + PQCLEAN_MCELIECE8192128_AVX_fft(eval, locator); // FFT + + // reencryption and weight check + + allone = PQCLEAN_MCELIECE8192128_AVX_vec256_set1_16b(0xFFFF); + + for (i = 0; i < 32; i++) { + error256[i] = PQCLEAN_MCELIECE8192128_AVX_vec256_or_reduce(eval[i]); + error256[i] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(error256[i], allone); + } + + check_weight = (uint16_t)(weight(error256) ^ SYS_T); + check_weight -= 1; + check_weight >>= 15; + + scaling_inv(scaled, inv, error256); + PQCLEAN_MCELIECE8192128_AVX_fft_tr(s_priv_cmp, scaled); + + check_synd = synd_cmp(s_priv, s_priv_cmp); + + // + + reformat_256to128(error128, error256); + PQCLEAN_MCELIECE8192128_AVX_benes(error128, bits_int, 0); + + for (i = 0; i < 64; i++) { + PQCLEAN_MCELIECE8192128_AVX_store16(e + i * 16, error128[i]); + } + + return 1 - (check_synd & check_weight); +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/decrypt.h b/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/decrypt.h new file mode 100644 index 0000000000..ad28767e8a --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/decrypt.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_MCELIECE8192128_AVX_DECRYPT_H +#define PQCLEAN_MCELIECE8192128_AVX_DECRYPT_H +/* + This file is for 
Nieddereiter decryption +*/ + +int PQCLEAN_MCELIECE8192128_AVX_decrypt(unsigned char * /*e*/, const unsigned char * /*sk*/, const unsigned char * /*c*/); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/encrypt.c b/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/encrypt.c new file mode 100644 index 0000000000..d33b1eb5c9 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/encrypt.c @@ -0,0 +1,80 @@ +/* + This file is for Niederreiter encryption +*/ + +#include "encrypt.h" + +#include "int32_sort.h" +#include "params.h" +#include "randombytes.h" +#include "util.h" + +#include + +/* input: public key pk, error vector e */ +/* output: syndrome s */ +extern void PQCLEAN_MCELIECE8192128_AVX_syndrome_asm(unsigned char *s, const unsigned char *pk, unsigned char *e); + +/* output: e, an error vector of weight t */ +static void gen_e(unsigned char *e) { + int i, j, eq; + + uint16_t ind[ SYS_T ]; + int32_t ind32[ SYS_T ]; + uint64_t e_int[ SYS_N / 64 ]; + uint64_t one = 1; + uint64_t mask; + uint64_t val[ SYS_T ]; + + while (1) { + randombytes((unsigned char *) ind, sizeof(ind)); + + for (i = 0; i < SYS_T; i++) { + ind32[i] = ind[i] &= GFMASK; + } + + // check for repetition + + PQCLEAN_MCELIECE8192128_AVX_int32_sort(ind32, SYS_T); + + eq = 0; + for (i = 1; i < SYS_T; i++) { + if (ind32[i - 1] == ind32[i]) { + eq = 1; + } + } + + if (eq == 0) { + break; + } + } + + for (j = 0; j < SYS_T; j++) { + val[j] = one << (ind[j] & 63); + } + + for (i = 0; i < SYS_N / 64; i++) { + e_int[i] = 0; + + for (j = 0; j < SYS_T; j++) { + mask = i ^ (ind[j] >> 6); + mask -= 1; + mask >>= 63; + mask = -mask; + + e_int[i] |= val[j] & mask; + } + } + + for (i = 0; i < SYS_N / 64; i++) { + PQCLEAN_MCELIECE8192128_AVX_store8(e + i * 8, e_int[i]); + } +} + +/* input: public key pk */ +/* output: error vector e, syndrome s */ +void PQCLEAN_MCELIECE8192128_AVX_encrypt(unsigned char *s, unsigned char *e, const unsigned char *pk) { + gen_e(e); + PQCLEAN_MCELIECE8192128_AVX_syndrome_asm(s, pk, e); +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/encrypt.h b/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/encrypt.h new file mode 100644 index 0000000000..3cbf3451f5 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/encrypt.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_MCELIECE8192128_AVX_ENCRYPT_H +#define PQCLEAN_MCELIECE8192128_AVX_ENCRYPT_H +/* + This file is for Niederreiter encryption +*/ + + +void PQCLEAN_MCELIECE8192128_AVX_encrypt(unsigned char * /*s*/, unsigned char * /*e*/, const unsigned char * /*pk*/); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/fft.c b/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/fft.c new file mode 100644 index 0000000000..e39e9ad2dc --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/fft.c @@ -0,0 +1,275 @@ +/* + This file is for the Gao-Mateer FFT + sse http://www.math.clemson.edu/~sgao/papers/GM10.pdf +*/ + +#include "fft.h" + +#include "transpose.h" +#include "vec256.h" + +#include + +/* input: in, polynomial in bitsliced form */ +/* output: in, result of applying the radix conversions on in */ +static void radix_conversions(vec128 *in) { + int i, j, k; + vec128 t; + uint64_t v0, v1; + + const vec128 mask[5][2] = { + { + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0x8888888888888888, 0x8888888888888888), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0x4444444444444444, 0x4444444444444444) + }, + { + 
PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0xC0C0C0C0C0C0C0C0, 0xC0C0C0C0C0C0C0C0), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0x3030303030303030, 0x3030303030303030) + }, + { + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0xF000F000F000F000, 0xF000F000F000F000), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0x0F000F000F000F00, 0x0F000F000F000F00) + }, + { + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0xFF000000FF000000, 0xFF000000FF000000), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0x00FF000000FF0000, 0x00FF000000FF0000) + }, + { + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0xFFFF000000000000, 0xFFFF000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0x0000FFFF00000000, 0x0000FFFF00000000) + } + }; + + const vec128 s[5][GFBITS] = { +#include "scalars_2x.inc" + }; + + // + + for (j = 0; j <= 5; j++) { + for (i = 0; i < GFBITS; i++) { + v1 = PQCLEAN_MCELIECE8192128_AVX_vec128_extract(in[i], 1); + v1 ^= v1 >> 32; + v0 = PQCLEAN_MCELIECE8192128_AVX_vec128_extract(in[i], 0); + v0 ^= v1 << 32; + in[i] = PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(v0, v1); + } + + for (i = 0; i < GFBITS; i++) { + for (k = 4; k >= j; k--) { + t = PQCLEAN_MCELIECE8192128_AVX_vec128_and(in[i], mask[k][0]); + t = PQCLEAN_MCELIECE8192128_AVX_vec128_srl_2x(t, 1 << k); + in[i] = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(in[i], t); + + t = PQCLEAN_MCELIECE8192128_AVX_vec128_and(in[i], mask[k][1]); + t = PQCLEAN_MCELIECE8192128_AVX_vec128_srl_2x(t, 1 << k); + in[i] = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(in[i], t); + } + } + + if (j < 5) { + PQCLEAN_MCELIECE8192128_AVX_vec128_mul(in, in, s[j]); // scaling + } + } +} + +/* input: in, result of applying the radix conversions to the input polynomial */ +/* output: out, evaluation results (by applying the FFT butterflies) */ +static void butterflies(vec256 out[][ GFBITS ], vec128 *in) { + int i, j, k, s, b; + + vec128 tmp[ GFBITS ]; + vec256 tmp0[ GFBITS ]; + vec256 tmp1[ GFBITS ]; + vec128 t[ GFBITS ]; + + union { + vec128 v[8][ GFBITS + 1 ]; + vec256 V[8][ (GFBITS + 1) / 2 ]; + } pre; + + union { + vec128 v[64][ 2 ]; + vec256 V[64]; + } buf; + + uint64_t v0, v1; + + const vec256 consts[ 33 ][ GFBITS ] = { +#include "consts.inc" + }; + + const vec256 powers[ 32 ][ GFBITS ] = { +#include "powers.inc" + }; + + uint64_t consts_ptr = 2; + + const unsigned char reversal[64] = { + 0, 32, 16, 48, 8, 40, 24, 56, + 4, 36, 20, 52, 12, 44, 28, 60, + 2, 34, 18, 50, 10, 42, 26, 58, + 6, 38, 22, 54, 14, 46, 30, 62, + 1, 33, 17, 49, 9, 41, 25, 57, + 5, 37, 21, 53, 13, 45, 29, 61, + 3, 35, 19, 51, 11, 43, 27, 59, + 7, 39, 23, 55, 15, 47, 31, 63 + }; + + const uint16_t beta[8] = {2522, 7827, 7801, 8035, 6897, 8167, 3476, 0}; + + // boradcast + + for (j = 0; j < GFBITS; j++) { + t[j] = PQCLEAN_MCELIECE8192128_AVX_vec128_unpack_high(in[j], in[j]); + } + + for (i = 0; i < 8; i += 2) { + for (j = 0; j < GFBITS; j++) { + v0 = (beta[i + 0] >> j) & 1; + v0 = -v0; + v1 = (beta[i + 1] >> j) & 1; + v1 = -v1; + + tmp[j] = PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(v0, v1); + } + + PQCLEAN_MCELIECE8192128_AVX_vec128_mul(tmp, t, tmp); + + for (j = 0; j < GFBITS; j++) { + pre.v[i + 0][j] = PQCLEAN_MCELIECE8192128_AVX_vec128_unpack_low(tmp[j], tmp[j]); + pre.v[i + 1][j] = PQCLEAN_MCELIECE8192128_AVX_vec128_unpack_high(tmp[j], tmp[j]); + } + } + + for (i = 0; i < GFBITS; i += 2) { + if (i != GFBITS - 1) { + buf.v[0][1] = PQCLEAN_MCELIECE8192128_AVX_vec128_unpack_low(in[i + 1], in[i + 1] ^ pre.v[6][i + 1]); + } + buf.v[0][0] = PQCLEAN_MCELIECE8192128_AVX_vec128_unpack_low(in[i + 0], in[i + 0] ^ pre.v[6][i + 0]); + + 
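/* The unrolled chain below fills every buf.V[j]: each value equals
   buf.V[0] XORed with pre.V[k][i / 2] for every bit k set in j, and the
   statements are ordered so that each new entry is derived from an
   already-computed one with a single vec256 XOR. */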
buf.V[1] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[0], pre.V[0][i / 2]); + buf.V[16] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[0], pre.V[4][i / 2]); + buf.V[3] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[1], pre.V[1][i / 2]); + buf.V[48] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[16], pre.V[5][i / 2]); + buf.V[49] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[48], pre.V[0][i / 2]); + buf.V[2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[0], pre.V[1][i / 2]); + buf.V[51] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[49], pre.V[1][i / 2]); + buf.V[6] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[2], pre.V[2][i / 2]); + buf.V[50] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[51], pre.V[0][i / 2]); + buf.V[7] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[6], pre.V[0][i / 2]); + buf.V[54] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[50], pre.V[2][i / 2]); + buf.V[5] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[7], pre.V[1][i / 2]); + buf.V[55] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[54], pre.V[0][i / 2]); + buf.V[53] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[55], pre.V[1][i / 2]); + buf.V[4] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[0], pre.V[2][i / 2]); + buf.V[52] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[53], pre.V[0][i / 2]); + buf.V[12] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[4], pre.V[3][i / 2]); + buf.V[60] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[52], pre.V[3][i / 2]); + buf.V[13] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[12], pre.V[0][i / 2]); + buf.V[61] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[60], pre.V[0][i / 2]); + buf.V[15] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[13], pre.V[1][i / 2]); + buf.V[63] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[61], pre.V[1][i / 2]); + buf.V[14] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[15], pre.V[0][i / 2]); + buf.V[62] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[63], pre.V[0][i / 2]); + buf.V[10] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[14], pre.V[2][i / 2]); + buf.V[58] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[62], pre.V[2][i / 2]); + buf.V[11] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[10], pre.V[0][i / 2]); + buf.V[59] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[58], pre.V[0][i / 2]); + buf.V[9] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[11], pre.V[1][i / 2]); + buf.V[57] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[59], pre.V[1][i / 2]); + buf.V[56] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[57], pre.V[0][i / 2]); + buf.V[8] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[0], pre.V[3][i / 2]); + buf.V[40] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[56], pre.V[4][i / 2]); + buf.V[24] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[8], pre.V[4][i / 2]); + buf.V[41] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[40], pre.V[0][i / 2]); + buf.V[25] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[24], pre.V[0][i / 2]); + buf.V[43] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[41], pre.V[1][i / 2]); + buf.V[27] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[25], pre.V[1][i / 2]); + buf.V[42] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[43], pre.V[0][i / 2]); + buf.V[26] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[27], pre.V[0][i / 2]); + buf.V[46] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[42], pre.V[2][i / 2]); + buf.V[30] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[26], pre.V[2][i / 2]); + buf.V[47] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[46], pre.V[0][i / 2]); + buf.V[31] = 
PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[30], pre.V[0][i / 2]); + buf.V[45] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[47], pre.V[1][i / 2]); + buf.V[29] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[31], pre.V[1][i / 2]); + buf.V[44] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[45], pre.V[0][i / 2]); + buf.V[28] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[29], pre.V[0][i / 2]); + buf.V[36] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[44], pre.V[3][i / 2]); + buf.V[20] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[28], pre.V[3][i / 2]); + buf.V[37] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[36], pre.V[0][i / 2]); + buf.V[21] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[20], pre.V[0][i / 2]); + buf.V[39] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[37], pre.V[1][i / 2]); + buf.V[23] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[21], pre.V[1][i / 2]); + buf.V[38] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[39], pre.V[0][i / 2]); + buf.V[22] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[23], pre.V[0][i / 2]); + buf.V[34] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[38], pre.V[2][i / 2]); + buf.V[18] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[22], pre.V[2][i / 2]); + buf.V[35] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[34], pre.V[0][i / 2]); + buf.V[19] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[18], pre.V[0][i / 2]); + buf.V[33] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[35], pre.V[1][i / 2]); + buf.V[17] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[19], pre.V[1][i / 2]); + buf.V[32] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[33], pre.V[0][i / 2]); + + + // transpose + + PQCLEAN_MCELIECE8192128_AVX_transpose_64x256_sp(buf.V); + + for (j = 0; j < 32; j++) { + if (i != GFBITS - 1) { + out[j][i + 1] = PQCLEAN_MCELIECE8192128_AVX_vec256_unpack_high(buf.V[ reversal[2 * j + 0] ], buf.V[ reversal[2 * j + 1] ]); + } + out[j][i + 0] = PQCLEAN_MCELIECE8192128_AVX_vec256_unpack_low (buf.V[ reversal[2 * j + 0] ], buf.V[ reversal[2 * j + 1] ]); + } + } + + // butterflies + + for (k = 0; k < 32; k += 2) { + for (b = 0; b < GFBITS; b++) { + tmp0[b] = PQCLEAN_MCELIECE8192128_AVX_vec256_unpack_low (out[k][b], out[k + 1][b]); + } + for (b = 0; b < GFBITS; b++) { + tmp1[b] = PQCLEAN_MCELIECE8192128_AVX_vec256_unpack_high (out[k][b], out[k + 1][b]); + } + + PQCLEAN_MCELIECE8192128_AVX_vec256_maa_asm(tmp0, tmp1, consts[1]); + + for (b = 0; b < GFBITS; b++) { + out[k][b] = PQCLEAN_MCELIECE8192128_AVX_vec256_unpack_low (tmp0[b], tmp1[b]); + } + for (b = 0; b < GFBITS; b++) { + out[k + 1][b] = PQCLEAN_MCELIECE8192128_AVX_vec256_unpack_high (tmp0[b], tmp1[b]); + } + } + + for (i = 0; i <= 4; i++) { + s = 1 << i; + + for (j = 0; j < 32; j += 2 * s) { + for (k = j; k < j + s; k++) { + PQCLEAN_MCELIECE8192128_AVX_vec256_maa_asm(out[k], out[k + s], consts[ consts_ptr + (k - j) ]); + } + } + + consts_ptr += (1 << i); + } + + // adding the part contributed by x^128 + + for (i = 0; i < 32; i++) { + for (b = 0; b < GFBITS; b++) { + out[i][b] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(out[i][b], powers[i][b]); + } + } +} + +/* input: in, polynomial in bitsliced form */ +/* output: out, bitsliced results of evaluating in all the field elements */ +void PQCLEAN_MCELIECE8192128_AVX_fft(vec256 out[][GFBITS], vec128 *in) { + radix_conversions(in); + butterflies(out, in); +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/fft.h b/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/fft.h new file mode 100644 index 0000000000..808773f6d6 --- /dev/null +++ 
b/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/fft.h @@ -0,0 +1,17 @@ +#ifndef PQCLEAN_MCELIECE8192128_AVX_FFT_H +#define PQCLEAN_MCELIECE8192128_AVX_FFT_H +/* + This file is for the Gao-Mateer FFT + sse http://www.math.clemson.edu/~sgao/papers/GM10.pdf +*/ + + +#include "params.h" +#include "vec128.h" +#include "vec256.h" +#include + +void PQCLEAN_MCELIECE8192128_AVX_fft(vec256 /*out*/[][GFBITS], vec128 * /*in*/); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/fft_tr.c b/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/fft_tr.c new file mode 100644 index 0000000000..7f8e71b983 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/fft_tr.c @@ -0,0 +1,379 @@ +/* + This file is for transpose of the Gao-Mateer FFT + Functions with names ending with _tr are (roughly) the transpose of the corresponding functions in fft.c +*/ + +#include "fft_tr.h" + +#include "transpose.h" + +#include + +static void radix_conversions_tr(vec256 *in) { + int i, j, k; + vec256 t; + uint64_t v[4]; + + const vec256 mask[6][2] = { + { + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0x2222222222222222, 0x2222222222222222, 0x2222222222222222, 0x2222222222222222), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0x4444444444444444, 0x4444444444444444, 0x4444444444444444, 0x4444444444444444) + }, + { + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0x0C0C0C0C0C0C0C0C, 0x0C0C0C0C0C0C0C0C, 0x0C0C0C0C0C0C0C0C, 0x0C0C0C0C0C0C0C0C), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0x3030303030303030, 0x3030303030303030, 0x3030303030303030, 0x3030303030303030) + }, + { + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0x00F000F000F000F0, 0x00F000F000F000F0, 0x00F000F000F000F0, 0x00F000F000F000F0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0x0F000F000F000F00, 0x0F000F000F000F00, 0x0F000F000F000F00, 0x0F000F000F000F00) + }, + { + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0x0000FF000000FF00, 0x0000FF000000FF00, 0x0000FF000000FF00, 0x0000FF000000FF00), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0x00FF000000FF0000, 0x00FF000000FF0000, 0x00FF000000FF0000, 0x00FF000000FF0000) + }, + { + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0x00000000FFFF0000, 0x00000000FFFF0000, 0x00000000FFFF0000, 0x00000000FFFF0000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0x0000FFFF00000000, 0x0000FFFF00000000, 0x0000FFFF00000000, 0x0000FFFF00000000) + }, + { + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0x00000000FFFFFFFF, 0x00000000FFFFFFFF, 0x00000000FFFFFFFF, 0x00000000FFFFFFFF) + } + }; + + const vec256 s[6][GFBITS] = { +#include "scalars_4x.inc" + }; + + // + + for (j = 6; j >= 0; j--) { + if (j < 6) { + PQCLEAN_MCELIECE8192128_AVX_vec256_mul(in, in, s[j]); // scaling + } + + for (k = j; k <= 4; k++) { + for (i = 0; i < GFBITS; i++) { + t = PQCLEAN_MCELIECE8192128_AVX_vec256_and(in[i], mask[k][0]); + t = PQCLEAN_MCELIECE8192128_AVX_vec256_sll_4x(t, 1 << k); + in[i] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(in[i], t); + + t = PQCLEAN_MCELIECE8192128_AVX_vec256_and(in[i], mask[k][1]); + t = PQCLEAN_MCELIECE8192128_AVX_vec256_sll_4x(t, 1 << k); + in[i] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(in[i], t); + } + } + + if (j <= 5) { + for (i = 0; i < GFBITS; i++) { + v[0] = PQCLEAN_MCELIECE8192128_AVX_vec256_extract(in[i], 0); + v[1] = PQCLEAN_MCELIECE8192128_AVX_vec256_extract(in[i], 1); + v[2] = PQCLEAN_MCELIECE8192128_AVX_vec256_extract(in[i], 2); + v[3] = PQCLEAN_MCELIECE8192128_AVX_vec256_extract(in[i], 
3); + + v[1] ^= v[0] >> 32; + v[1] ^= v[1] << 32; + v[3] ^= v[2] >> 32; + v[3] ^= v[3] << 32; + + in[i] = PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(v[0], v[1], v[2], v[3]); + } + } + + for (i = 0; i < GFBITS; i++) { + v[0] = PQCLEAN_MCELIECE8192128_AVX_vec256_extract(in[i], 0); + v[1] = PQCLEAN_MCELIECE8192128_AVX_vec256_extract(in[i], 1); + v[2] = PQCLEAN_MCELIECE8192128_AVX_vec256_extract(in[i], 2); + v[3] = PQCLEAN_MCELIECE8192128_AVX_vec256_extract(in[i], 3); + + v[2] ^= v[1]; + v[3] ^= v[2]; + + in[i] = PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(v[0], v[1], v[2], v[3]); + } + + } +} + +static void butterflies_tr(vec256 *out, vec256 in[][ GFBITS ]) { + int i, j, k, s, b; + + vec256 t0[ GFBITS ]; + vec256 t1[ GFBITS ]; + vec256 t; + + vec128 out128[ GFBITS ][ 2 ]; + vec128 tmp[ GFBITS ]; + + union { + vec128 v[6][ GFBITS + 1 ]; + vec256 V[6][ (GFBITS + 1) / 2 ]; + } pre; + + union { + vec128 v[64][ 2 ]; + vec256 V[64]; + } buf; + + const vec256 consts[ 33 ][ GFBITS ] = { +#include "consts.inc" + }; + + uint64_t v[4]; + uint64_t consts_ptr = 33; + + const unsigned char reversal[] = { + 0, 32, 16, 48, 8, 40, 24, 56, + 4, 36, 20, 52, 12, 44, 28, 60, + 2, 34, 18, 50, 10, 42, 26, 58, + 6, 38, 22, 54, 14, 46, 30, 62, + 1, 33, 17, 49, 9, 41, 25, 57, + 5, 37, 21, 53, 13, 45, 29, 61, + 3, 35, 19, 51, 11, 43, 27, 59, + 7, 39, 23, 55, 15, 47, 31, 63 + }; + + const uint16_t beta[6] = {5246, 5306, 6039, 6685, 4905, 6755}; + + // butterflies + + for (i = 4; i >= 0; i--) { + s = 1 << i; + consts_ptr -= s; + + for (j = 0; j < 32; j += 2 * s) { + for (k = j; k < j + s; k++) { + PQCLEAN_MCELIECE8192128_AVX_vec256_ama_asm(in[k], in[k + s], consts[ consts_ptr + (k - j) ]); + } + } + + } + + for (k = 0; k < 32; k += 2) { + for (b = 0; b < GFBITS; b++) { + t0[b] = PQCLEAN_MCELIECE8192128_AVX_vec256_unpack_low(in[k][b], in[k + 1][b]); + } + for (b = 0; b < GFBITS; b++) { + t1[b] = PQCLEAN_MCELIECE8192128_AVX_vec256_unpack_high(in[k][b], in[k + 1][b]); + } + + PQCLEAN_MCELIECE8192128_AVX_vec256_ama_asm(t0, t1, consts[1]); + + for (b = 0; b < GFBITS; b++) { + in[k][b] = PQCLEAN_MCELIECE8192128_AVX_vec256_unpack_low(t0[b], t1[b]); + } + for (b = 0; b < GFBITS; b++) { + in[k + 1][b] = PQCLEAN_MCELIECE8192128_AVX_vec256_unpack_high(t0[b], t1[b]); + } + + for (b = 0; b < GFBITS; b++) { + t0[b] = PQCLEAN_MCELIECE8192128_AVX_vec256_unpack_low_2x(in[k][b], in[k + 1][b]); + } + for (b = 0; b < GFBITS; b++) { + t1[b] = PQCLEAN_MCELIECE8192128_AVX_vec256_unpack_high_2x(in[k][b], in[k + 1][b]); + } + + PQCLEAN_MCELIECE8192128_AVX_vec256_ama_asm(t0, t1, consts[0]); + + for (b = 0; b < GFBITS; b++) { + in[k + 0][b] = PQCLEAN_MCELIECE8192128_AVX_vec256_unpack_low_2x(t0[b], t1[b]); + } + for (b = 0; b < GFBITS; b++) { + in[k + 1][b] = PQCLEAN_MCELIECE8192128_AVX_vec256_unpack_high_2x(t0[b], t1[b]); + } + } + + + // boradcast + + for (i = 0; i < GFBITS; i += 2) { + // transpose + + for (k = 0; k < 32; k++) { + if (i != GFBITS - 1) { + buf.v[ reversal[2 * k + 0] ][1] = PQCLEAN_MCELIECE8192128_AVX_vec256_extract2x(in[ k ][i + 1], 0); + buf.v[ reversal[2 * k + 1] ][1] = PQCLEAN_MCELIECE8192128_AVX_vec256_extract2x(in[ k ][i + 1], 1); + } + + buf.v[ reversal[2 * k + 0] ][0] = PQCLEAN_MCELIECE8192128_AVX_vec256_extract2x(in[ k ][i + 0], 0); + buf.v[ reversal[2 * k + 1] ][0] = PQCLEAN_MCELIECE8192128_AVX_vec256_extract2x(in[ k ][i + 0], 1); + } + + PQCLEAN_MCELIECE8192128_AVX_transpose_64x256_sp(buf.V); + + // + + pre.V[0][i / 2] = buf.V[32]; + buf.V[33] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[33], buf.V[32]); + pre.V[1][i / 2] 
= buf.V[33]; + buf.V[35] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[35], buf.V[33]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[35]); + buf.V[34] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[34], buf.V[35]); + pre.V[2][i / 2] = buf.V[34]; + buf.V[38] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[38], buf.V[34]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[38]); + buf.V[39] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[39], buf.V[38]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[1][i / 2], buf.V[39]); + buf.V[37] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[37], buf.V[39]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[37]); + buf.V[36] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[36], buf.V[37]); + pre.V[3][i / 2] = buf.V[36]; + buf.V[44] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[44], buf.V[36]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[44]); + buf.V[45] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[45], buf.V[44]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[1][i / 2], buf.V[45]); + buf.V[47] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[47], buf.V[45]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[47]); + buf.V[46] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[46], buf.V[47]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[2][i / 2], buf.V[46]); + buf.V[42] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[42], buf.V[46]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[42]); + buf.V[43] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[43], buf.V[42]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[1][i / 2], buf.V[43]); + buf.V[41] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[41], buf.V[43]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[41]); + buf.V[40] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[40], buf.V[41]); + pre.V[4][i / 2] = buf.V[40]; + buf.V[56] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[56], buf.V[40]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[56]); + buf.V[57] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[57], buf.V[56]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[1][i / 2], buf.V[57]); + buf.V[59] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[59], buf.V[57]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[59]); + buf.V[58] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[58], buf.V[59]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[2][i / 2], buf.V[58]); + buf.V[62] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[62], buf.V[58]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[62]); + buf.V[63] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[63], buf.V[62]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[1][i / 2], buf.V[63]); + buf.V[61] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[61], buf.V[63]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[61]); + buf.V[60] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[60], buf.V[61]); + pre.V[3][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[3][i / 2], buf.V[60]); + buf.V[52] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[52], buf.V[60]); + pre.V[0][i / 2] = 
PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[52]); + buf.V[53] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[53], buf.V[52]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[1][i / 2], buf.V[53]); + buf.V[55] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[55], buf.V[53]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[55]); + buf.V[54] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[54], buf.V[55]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[2][i / 2], buf.V[54]); + buf.V[50] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[50], buf.V[54]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[50]); + buf.V[51] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[51], buf.V[50]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[1][i / 2], buf.V[51]); + buf.V[49] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[49], buf.V[51]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[49]); + buf.V[48] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[48], buf.V[49]); + pre.V[5][i / 2] = buf.V[48]; + buf.V[16] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[16], buf.V[48]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[16]); + buf.V[17] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[17], buf.V[16]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[1][i / 2], buf.V[17]); + buf.V[19] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[19], buf.V[17]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[19]); + buf.V[18] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[18], buf.V[19]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[2][i / 2], buf.V[18]); + buf.V[22] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[22], buf.V[18]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[22]); + buf.V[23] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[23], buf.V[22]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[1][i / 2], buf.V[23]); + buf.V[21] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[21], buf.V[23]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[21]); + buf.V[20] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[20], buf.V[21]); + pre.V[3][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[3][i / 2], buf.V[20]); + buf.V[28] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[28], buf.V[20]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[28]); + buf.V[29] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[29], buf.V[28]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[1][i / 2], buf.V[29]); + buf.V[31] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[31], buf.V[29]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[31]); + buf.V[30] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[30], buf.V[31]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[2][i / 2], buf.V[30]); + buf.V[26] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[26], buf.V[30]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[26]); + buf.V[27] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[27], buf.V[26]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[1][i / 2], buf.V[27]); + buf.V[25] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[25], buf.V[27]); + pre.V[0][i / 2] = 
PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[25]); + buf.V[24] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[24], buf.V[25]); + pre.V[4][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[4][i / 2], buf.V[24]); + buf.V[8] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[8], buf.V[24]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[8]); + buf.V[9] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[9], buf.V[8]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[1][i / 2], buf.V[9]); + buf.V[11] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[11], buf.V[9]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[11]); + buf.V[10] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[10], buf.V[11]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[2][i / 2], buf.V[10]); + buf.V[14] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[14], buf.V[10]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[14]); + buf.V[15] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[15], buf.V[14]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[1][i / 2], buf.V[15]); + buf.V[13] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[13], buf.V[15]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[13]); + buf.V[12] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[12], buf.V[13]); + pre.V[3][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[3][i / 2], buf.V[12]); + buf.V[4] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[4], buf.V[12]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[4]); + buf.V[5] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[5], buf.V[4]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[1][i / 2], buf.V[5]); + buf.V[7] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[7], buf.V[5]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[7]); + buf.V[6] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[6], buf.V[7]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[2][i / 2], buf.V[6]); + buf.V[2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[2], buf.V[6]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[2]); + buf.V[3] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[3], buf.V[2]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[1][i / 2], buf.V[3]); + buf.V[1] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[1], buf.V[3]); + + pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(pre.V[0][i / 2], buf.V[1]); + t = PQCLEAN_MCELIECE8192128_AVX_vec256_xor(buf.V[0], buf.V[1]); + + if (i != GFBITS - 1) { + out128[i + 1][0] = PQCLEAN_MCELIECE8192128_AVX_vec256_extract2x(t, 1); + } + out128[i + 0][0] = PQCLEAN_MCELIECE8192128_AVX_vec256_extract2x(t, 0); + + } + + // + + for (j = 0; j < GFBITS; j++) { + tmp[j] = PQCLEAN_MCELIECE8192128_AVX_vec128_setbits((beta[0] >> j) & 1); + } + + PQCLEAN_MCELIECE8192128_AVX_vec128_mul(tmp, pre.v[0], tmp); + + for (b = 0; b < GFBITS; b++) { + out128[b][1] = tmp[b]; + } + + for (i = 1; i < 6; i++) { + for (j = 0; j < GFBITS; j++) { + tmp[j] = PQCLEAN_MCELIECE8192128_AVX_vec128_setbits((beta[i] >> j) & 1); + } + + PQCLEAN_MCELIECE8192128_AVX_vec128_mul(tmp, pre.v[i], tmp); + + for (b = 0; b < GFBITS; b++) { + out128[b][1] = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(out128[b][1], tmp[b]); + } + } + + for (b = 0; b < GFBITS; b++) { + v[0] = 
PQCLEAN_MCELIECE8192128_AVX_vec128_extract(out128[b][0], 0); + v[1] = PQCLEAN_MCELIECE8192128_AVX_vec128_extract(out128[b][0], 1); + v[2] = PQCLEAN_MCELIECE8192128_AVX_vec128_extract(out128[b][1], 0); + v[3] = PQCLEAN_MCELIECE8192128_AVX_vec128_extract(out128[b][1], 1); + + out[b] = PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(v[0], v[1], v[2], v[3]); + } +} + +void PQCLEAN_MCELIECE8192128_AVX_fft_tr(vec256 *out, vec256 in[][ GFBITS ]) { + butterflies_tr(out, in); + radix_conversions_tr(out); +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/fft_tr.h b/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/fft_tr.h new file mode 100644 index 0000000000..76f999aa10 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/fft_tr.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE8192128_AVX_FFT_TR_H +#define PQCLEAN_MCELIECE8192128_AVX_FFT_TR_H +/* + This file is for transpose of the Gao-Mateer FFT +*/ + + +#include "params.h" +#include "vec256.h" + +void PQCLEAN_MCELIECE8192128_AVX_fft_tr(vec256 * /*out*/, vec256 /*in*/[][ GFBITS ]); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/gf.c b/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/gf.c new file mode 100644 index 0000000000..b69a7c1aa5 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/gf.c @@ -0,0 +1,236 @@ +/* + this file is for functions for field arithmetic +*/ + +#include "gf.h" + +#include "params.h" + +#include + +/* 2 field multiplications */ +uint64_t PQCLEAN_MCELIECE8192128_AVX_gf_mul2(gf a, gf b0, gf b1) { + int i; + + uint64_t tmp = 0; + uint64_t t0; + uint64_t t1; + uint64_t t; + uint64_t mask = 0x0000000100000001; + + t0 = a; + t1 = b1; + t1 = (t1 << 32) | b0; + + for (i = 0; i < GFBITS; i++) { + tmp ^= t0 * (t1 & mask); + mask += mask; + } + + // + + t = tmp & 0x01FF000001FF0000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + t = tmp & 0x0000E0000000E000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + return tmp & 0x00001FFF00001FFF; +} + +/* field multiplication */ +gf PQCLEAN_MCELIECE8192128_AVX_gf_mul(gf in0, gf in1) { + int i; + + uint64_t tmp; + uint64_t t0; + uint64_t t1; + uint64_t t; + + t0 = in0; + t1 = in1; + + tmp = t0 * (t1 & 1); + + for (i = 1; i < GFBITS; i++) { + tmp ^= (t0 * (t1 & ((uint64_t)1 << i))); + } + + // + + t = tmp & 0x1FF0000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + t = tmp & 0x000E000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + return tmp & GFMASK; +} + +/* 2 field squarings */ +static inline gf gf_sq2(gf in) { + int i; + + const uint64_t B[] = {0x1111111111111111, + 0x0303030303030303, + 0x000F000F000F000F, + 0x000000FF000000FF + }; + + const uint64_t M[] = {0x0001FF0000000000, + 0x000000FF80000000, + 0x000000007FC00000, + 0x00000000003FE000 + }; + + uint64_t x = in; + uint64_t t; + + x = (x | (x << 24)) & B[3]; + x = (x | (x << 12)) & B[2]; + x = (x | (x << 6)) & B[1]; + x = (x | (x << 3)) & B[0]; + + for (i = 0; i < 4; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* square and multiply */ +static inline gf gf_sqmul(gf in, gf m) { + int i; + + uint64_t x; + uint64_t t0; + uint64_t t1; + uint64_t t; + + const uint64_t M[] = {0x0000001FF0000000, + 0x000000000FF80000, + 0x000000000007E000 + }; + + t0 = in; + t1 = m; + + x = (t1 << 6) * (t0 & (1 << 6)); + + t0 ^= (t0 << 7); + + x ^= (t1 * (t0 & (0x04001))); + x ^= (t1 * (t0 & (0x08002))) << 1; + x ^= (t1 * (t0 & (0x10004))) << 2; + x ^= (t1 * (t0 
& (0x20008))) << 3; + x ^= (t1 * (t0 & (0x40010))) << 4; + x ^= (t1 * (t0 & (0x80020))) << 5; + + for (i = 0; i < 3; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* square twice and multiply */ +static inline gf gf_sq2mul(gf in, gf m) { + int i; + + uint64_t x; + uint64_t t0; + uint64_t t1; + uint64_t t; + + const uint64_t M[] = {0x1FF0000000000000, + 0x000FF80000000000, + 0x000007FC00000000, + 0x00000003FE000000, + 0x0000000001FE0000, + 0x000000000001E000 + }; + + t0 = in; + t1 = m; + + x = (t1 << 18) * (t0 & (1 << 6)); + + t0 ^= (t0 << 21); + + x ^= (t1 * (t0 & (0x010000001))); + x ^= (t1 * (t0 & (0x020000002))) << 3; + x ^= (t1 * (t0 & (0x040000004))) << 6; + x ^= (t1 * (t0 & (0x080000008))) << 9; + x ^= (t1 * (t0 & (0x100000010))) << 12; + x ^= (t1 * (t0 & (0x200000020))) << 15; + + for (i = 0; i < 6; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* return num/den */ +gf PQCLEAN_MCELIECE8192128_AVX_gf_frac(gf den, gf num) { + gf tmp_11; + gf tmp_1111; + gf out; + + tmp_11 = gf_sqmul(den, den); // 11 + tmp_1111 = gf_sq2mul(tmp_11, tmp_11); // 1111 + out = gf_sq2(tmp_1111); + out = gf_sq2mul(out, tmp_1111); // 11111111 + out = gf_sq2(out); + out = gf_sq2mul(out, tmp_1111); // 111111111111 + + return gf_sqmul(out, num); // 1111111111110 +} + +/* return 1/den */ +gf PQCLEAN_MCELIECE8192128_AVX_gf_inv(gf in) { + return PQCLEAN_MCELIECE8192128_AVX_gf_frac(in, ((gf) 1)); +} + +/* check if a == 0 */ +gf PQCLEAN_MCELIECE8192128_AVX_gf_iszero(gf a) { + uint32_t t = a; + + t -= 1; + t >>= 19; + + return (gf) t; +} + +/* multiplication in GF((2^m)^t) */ +void PQCLEAN_MCELIECE8192128_AVX_GF_mul(gf *out, const gf *in0, const gf *in1) { + int i, j; + + gf prod[255]; + + for (i = 0; i < 255; i++) { + prod[i] = 0; + } + + for (i = 0; i < 128; i++) { + for (j = 0; j < 128; j++) { + prod[i + j] ^= PQCLEAN_MCELIECE8192128_AVX_gf_mul(in0[i], in1[j]); + } + } + + // + + for (i = 254; i >= 128; i--) { + prod[i - 123] ^= PQCLEAN_MCELIECE8192128_AVX_gf_mul(prod[i], (gf) 7682); + prod[i - 125] ^= PQCLEAN_MCELIECE8192128_AVX_gf_mul(prod[i], (gf) 2159); + prod[i - 128] ^= PQCLEAN_MCELIECE8192128_AVX_gf_mul(prod[i], (gf) 3597); + } + + for (i = 0; i < 128; i++) { + out[i] = prod[i]; + } +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/gf.h b/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/gf.h new file mode 100644 index 0000000000..a3ccb25401 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/gf.h @@ -0,0 +1,25 @@ +#ifndef PQCLEAN_MCELIECE8192128_AVX_GF_H +#define PQCLEAN_MCELIECE8192128_AVX_GF_H +/* + This file is for functions for field arithmetic +*/ + + +#include "params.h" + +#include + +typedef uint16_t gf; + +gf PQCLEAN_MCELIECE8192128_AVX_gf_iszero(gf /*a*/); +gf PQCLEAN_MCELIECE8192128_AVX_gf_mul(gf /*in0*/, gf /*in1*/); +gf PQCLEAN_MCELIECE8192128_AVX_gf_frac(gf /*den*/, gf /*num*/); +gf PQCLEAN_MCELIECE8192128_AVX_gf_inv(gf /*in*/); + +void PQCLEAN_MCELIECE8192128_AVX_GF_mul(gf * /*out*/, const gf * /*in0*/, const gf * /*in1*/); + +/* 2 field multiplications */ +uint64_t PQCLEAN_MCELIECE8192128_AVX_gf_mul2(gf a, gf b0, gf b1); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/int32_sort.c b/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/int32_sort.c new file mode 100644 index 0000000000..0e11f1c4c0 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/int32_sort.c @@ -0,0 
+1,1211 @@ +#include "int32_sort.h" +#define int32 int32_t + +#include + +typedef __m256i int32x8; +#define int32x8_load(z) _mm256_loadu_si256((__m256i_u *) (z)) +#define int32x8_store(z,i) _mm256_storeu_si256((__m256i_u *) (z),(i)) +#define int32x8_min _mm256_min_epi32 +#define int32x8_max _mm256_max_epi32 + +#define int32x8_MINMAX(a,b) \ + do { \ + int32x8 c = int32x8_min(a, b); \ + (b) = int32x8_max(a,b); \ + (a) = c; \ + } while(0) + +static inline void int32_MINMAX(int32 *a, int32 *b) { + int32 ab = *b ^ *a; + int32 c = *b - *a; + c ^= ab & (c ^ *b); + c >>= 31; + c &= ab; + *a ^= c; + *b ^= c; +} + +static void minmax_vector(int32 *x, int32 *y, size_t n) { + if (n < 8) { + while (n > 0) { + int32_MINMAX(x, y); + ++x; + ++y; + --n; + } + return; + } + if (n & 7) { + int32x8 x0 = int32x8_load(x + n - 8); + int32x8 y0 = int32x8_load(y + n - 8); + int32x8_MINMAX(x0, y0); + int32x8_store(x + n - 8, x0); + int32x8_store(y + n - 8, y0); + n &= ~7; + } + do { + int32x8 x0 = int32x8_load(x); + int32x8 y0 = int32x8_load(y); + int32x8_MINMAX(x0, y0); + int32x8_store(x, x0); + int32x8_store(y, y0); + x += 8; + y += 8; + n -= 8; + } while (n); +} + +/* stages 8,4,2,1 of size-16 bitonic merging */ +static void merge16_finish(int32 *x, int32x8 x0, int32x8 x1, int flagdown) { + int32x8 b0, b1, c0, c1, mask; + + int32x8_MINMAX(x0, x1); + + b0 = _mm256_permute2x128_si256(x0, x1, 0x20); /* A0123B0123 */ + b1 = _mm256_permute2x128_si256(x0, x1, 0x31); /* A4567B4567 */ + + int32x8_MINMAX(b0, b1); + + c0 = _mm256_unpacklo_epi64(b0, b1); /* A0145B0145 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* A2367B2367 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A0213B0213 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A4657B4657 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* A0246B0246 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* A1357B1357 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A0123B0123 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A4567B4567 */ + + x0 = _mm256_permute2x128_si256(b0, b1, 0x20); /* A01234567 */ + x1 = _mm256_permute2x128_si256(b0, b1, 0x31); /* A01234567 */ + + if (flagdown) { + mask = _mm256_set1_epi32(-1); + x0 ^= mask; + x1 ^= mask; + } + + int32x8_store(&x[0], x0); + int32x8_store(&x[8], x1); +} + +/* stages 64,32 of bitonic merging; n is multiple of 128 */ +static void int32_twostages_32(int32 *x, size_t n) { + size_t i; + + while (n > 0) { + for (i = 0; i < 32; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + 32]); + int32x8 x2 = int32x8_load(&x[i + 64]); + int32x8 x3 = int32x8_load(&x[i + 96]); + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + 32], x1); + int32x8_store(&x[i + 64], x2); + int32x8_store(&x[i + 96], x3); + } + x += 128; + n -= 128; + } +} + +/* stages 4q,2q,q of bitonic merging */ +static size_t int32_threestages(int32 *x, size_t n, size_t q) { + size_t k, i; + + for (k = 0; k + 8 * q <= n; k += 8 * q) { + for (i = k; i < k + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + 
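/* distance-4q comparisons done above; the next two groups of MINMAX
   calls merge at distance 2q and then at distance q */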
int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + } + + return k; +} + +/* n is a power of 2; n >= 8; if n == 8 then flagdown */ +// NOLINTNEXTLINE(google-readability-function-size) +static void int32_sort_2power(int32 *x, size_t n, int flagdown) { + size_t p, q, i, j, k; + int32x8 mask; + + if (n == 8) { + int32 x0 = x[0]; + int32 x1 = x[1]; + int32 x2 = x[2]; + int32 x3 = x[3]; + int32 x4 = x[4]; + int32 x5 = x[5]; + int32 x6 = x[6]; + int32 x7 = x[7]; + + /* odd-even sort instead of bitonic sort */ + + int32_MINMAX(&x1, &x0); + int32_MINMAX(&x3, &x2); + int32_MINMAX(&x2, &x0); + int32_MINMAX(&x3, &x1); + int32_MINMAX(&x2, &x1); + + int32_MINMAX(&x5, &x4); + int32_MINMAX(&x7, &x6); + int32_MINMAX(&x6, &x4); + int32_MINMAX(&x7, &x5); + int32_MINMAX(&x6, &x5); + + int32_MINMAX(&x4, &x0); + int32_MINMAX(&x6, &x2); + int32_MINMAX(&x4, &x2); + + int32_MINMAX(&x5, &x1); + int32_MINMAX(&x7, &x3); + int32_MINMAX(&x5, &x3); + + int32_MINMAX(&x2, &x1); + int32_MINMAX(&x4, &x3); + int32_MINMAX(&x6, &x5); + + x[0] = x0; + x[1] = x1; + x[2] = x2; + x[3] = x3; + x[4] = x4; + x[5] = x5; + x[6] = x6; + x[7] = x7; + return; + } + + if (n == 16) { + int32x8 x0, x1, b0, b1, c0, c1; + + x0 = int32x8_load(&x[0]); + x1 = int32x8_load(&x[8]); + + mask = _mm256_set_epi32(0, 0, -1, -1, 0, 0, -1, -1); + + x0 ^= mask; /* A01234567 */ + x1 ^= mask; /* B01234567 */ + + b0 = _mm256_unpacklo_epi32(x0, x1); /* AB0AB1AB4AB5 */ + b1 = _mm256_unpackhi_epi32(x0, x1); /* AB2AB3AB6AB7 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* AB0AB2AB4AB6 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* AB1AB3AB5AB7 */ + + int32x8_MINMAX(c0, c1); + + mask = _mm256_set_epi32(0, 0, -1, -1, -1, -1, 0, 0); + c0 ^= mask; + c1 ^= mask; + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A01B01A45B45 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A23B23A67B67 */ + + int32x8_MINMAX(b0, b1); + + x0 = _mm256_unpacklo_epi64(b0, b1); /* A01234567 */ + x1 = _mm256_unpackhi_epi64(b0, b1); /* B01234567 */ + + b0 = _mm256_unpacklo_epi32(x0, x1); /* AB0AB1AB4AB5 */ + b1 = _mm256_unpackhi_epi32(x0, x1); /* AB2AB3AB6AB7 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* AB0AB2AB4AB6 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* AB1AB3AB5AB7 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A01B01A45B45 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A23B23A67B67 */ + + b0 ^= mask; + b1 ^= mask; + + c0 = _mm256_permute2x128_si256(b0, b1, 0x20); /* A01B01A23B23 */ + c1 = _mm256_permute2x128_si256(b0, b1, 0x31); /* A45B45A67B67 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_permute2x128_si256(c0, c1, 0x20); /* A01B01A45B45 */ + b1 = _mm256_permute2x128_si256(c0, c1, 0x31); /* A23B23A67B67 */ + + int32x8_MINMAX(b0, b1); + + x0 = _mm256_unpacklo_epi64(b0, b1); /* A01234567 */ + x1 = _mm256_unpackhi_epi64(b0, b1); /* B01234567 */ + + b0 = _mm256_unpacklo_epi32(x0, x1); /* AB0AB1AB4AB5 */ + b1 = _mm256_unpackhi_epi32(x0, x1); /* AB2AB3AB6AB7 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* AB0AB2AB4AB6 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* AB1AB3AB5AB7 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* 
A01B01A45B45 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A23B23A67B67 */ + + x0 = _mm256_unpacklo_epi64(b0, b1); /* A01234567 */ + x1 = _mm256_unpackhi_epi64(b0, b1); /* B01234567 */ + + mask = _mm256_set1_epi32(-1); + if (flagdown) { + x1 ^= mask; + } else { + x0 ^= mask; + } + + merge16_finish(x, x0, x1, flagdown); + return; + } + + if (n == 32) { + int32x8 x0, x1, x2, x3; + + int32_sort_2power(x, 16, 1); + int32_sort_2power(x + 16, 16, 0); + + x0 = int32x8_load(&x[0]); + x1 = int32x8_load(&x[8]); + x2 = int32x8_load(&x[16]); + x3 = int32x8_load(&x[24]); + + if (flagdown) { + mask = _mm256_set1_epi32(-1); + x0 ^= mask; + x1 ^= mask; + x2 ^= mask; + x3 ^= mask; + } + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + + merge16_finish(x, x0, x1, flagdown); + merge16_finish(x + 16, x2, x3, flagdown); + return; + } + + p = n >> 3; + for (i = 0; i < p; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x2 = int32x8_load(&x[i + 2 * p]); + int32x8 x4 = int32x8_load(&x[i + 4 * p]); + int32x8 x6 = int32x8_load(&x[i + 6 * p]); + + /* odd-even stage instead of bitonic stage */ + + int32x8_MINMAX(x4, x0); + int32x8_MINMAX(x6, x2); + int32x8_MINMAX(x2, x0); + int32x8_MINMAX(x6, x4); + int32x8_MINMAX(x2, x4); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + 2 * p], x2); + int32x8_store(&x[i + 4 * p], x4); + int32x8_store(&x[i + 6 * p], x6); + + int32x8 x1 = int32x8_load(&x[i + p]); + int32x8 x3 = int32x8_load(&x[i + 3 * p]); + int32x8 x5 = int32x8_load(&x[i + 5 * p]); + int32x8 x7 = int32x8_load(&x[i + 7 * p]); + + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x5, x3); + + int32x8_store(&x[i + p], x1); + int32x8_store(&x[i + 3 * p], x3); + int32x8_store(&x[i + 5 * p], x5); + int32x8_store(&x[i + 7 * p], x7); + } + + if (n >= 128) { + int flip, flipflip; + + mask = _mm256_set1_epi32(-1); + + for (j = 0; j < n; j += 32) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 16]); + x0 ^= mask; + x1 ^= mask; + int32x8_store(&x[j], x0); + int32x8_store(&x[j + 16], x1); + } + + p = 8; + for (;;) { /* for p in [8, 16, ..., n/16] */ + q = p >> 1; + while (q >= 128) { + int32_threestages(x, n, q >> 2); + q >>= 3; + } + if (q == 64) { + int32_twostages_32(x, n); + q = 16; + } + if (q == 32) { + q = 8; + for (k = 0; k < n; k += 8 * q) { + for (i = k; i < k + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + } + q = 4; + } + if (q == 16) { + q = 8; + for (k = 0; k < n; k += 4 * q) { + for (i = k; i < k + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 
* q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + } + } + q = 4; + } + if (q == 8) { + for (k = 0; k < n; k += q + q) { + int32x8 x0 = int32x8_load(&x[k]); + int32x8 x1 = int32x8_load(&x[k + q]); + + int32x8_MINMAX(x0, x1); + + int32x8_store(&x[k], x0); + int32x8_store(&x[k + q], x1); + } + } + + q = n >> 3; + flip = 0; + if (p << 1 == q) { + flip = 1; + } + flipflip = 1 - flip; + for (j = 0; j < q; j += p + p) { + for (k = j; k < j + p + p; k += p) { + for (i = k; i < k + p; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + + if (flip) { + x0 ^= mask; + x1 ^= mask; + x2 ^= mask; + x3 ^= mask; + x4 ^= mask; + x5 ^= mask; + x6 ^= mask; + x7 ^= mask; + } + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + flip ^= 1; + } + flip ^= flipflip; + } + + if (p << 4 == n) { + break; + } + p <<= 1; + } + } + + for (p = 4; p >= 1; p >>= 1) { + int32 *z = x; + int32 *target = x + n; + if (p == 4) { + mask = _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1); + while (z != target) { + int32x8 x0 = int32x8_load(&z[0]); + int32x8 x1 = int32x8_load(&z[8]); + x0 ^= mask; + x1 ^= mask; + int32x8_store(&z[0], x0); + int32x8_store(&z[8], x1); + z += 16; + } + } else if (p == 2) { + mask = _mm256_set_epi32(0, 0, -1, -1, -1, -1, 0, 0); + while (z != target) { + int32x8 x0 = int32x8_load(&z[0]); + int32x8 x1 = int32x8_load(&z[8]); + x0 ^= mask; + x1 ^= mask; + int32x8 b0 = _mm256_permute2x128_si256(x0, x1, 0x20); + int32x8 b1 = _mm256_permute2x128_si256(x0, x1, 0x31); + int32x8_MINMAX(b0, b1); + int32x8 c0 = _mm256_permute2x128_si256(b0, b1, 0x20); + int32x8 c1 = _mm256_permute2x128_si256(b0, b1, 0x31); + int32x8_store(&z[0], c0); + int32x8_store(&z[8], c1); + z += 16; + } + } else { /* p == 1 */ + mask = _mm256_set_epi32(0, -1, -1, 0, 0, -1, -1, 0); + while (z != target) { + int32x8 x0 = int32x8_load(&z[0]); + int32x8 x1 = int32x8_load(&z[8]); + x0 ^= mask; + x1 ^= mask; + int32x8 b0 = _mm256_permute2x128_si256(x0, x1, 0x20); /* A0123B0123 */ + int32x8 b1 = _mm256_permute2x128_si256(x0, x1, 0x31); /* A4567B4567 */ + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); /* A0145B0145 */ + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); /* A2367B2367 */ + int32x8_MINMAX(c0, c1); + int32x8 d0 = _mm256_unpacklo_epi64(c0, c1); /* A0123B0123 */ + int32x8 d1 = _mm256_unpackhi_epi64(c0, c1); /* A4567B4567 */ + int32x8_MINMAX(d0, d1); + int32x8 e0 = _mm256_permute2x128_si256(d0, d1, 0x20); + int32x8 e1 = _mm256_permute2x128_si256(d0, d1, 0x31); + int32x8_store(&z[0], 
e0); + int32x8_store(&z[8], e1); + z += 16; + } + } + + q = n >> 4; + while (q >= 128 || q == 32) { + int32_threestages(x, n, q >> 2); + q >>= 3; + } + while (q >= 16) { + q >>= 1; + for (j = 0; j < n; j += 4 * q) { + for (k = j; k < j + q; k += 8) { + int32x8 x0 = int32x8_load(&x[k]); + int32x8 x1 = int32x8_load(&x[k + q]); + int32x8 x2 = int32x8_load(&x[k + 2 * q]); + int32x8 x3 = int32x8_load(&x[k + 3 * q]); + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + + int32x8_store(&x[k], x0); + int32x8_store(&x[k + q], x1); + int32x8_store(&x[k + 2 * q], x2); + int32x8_store(&x[k + 3 * q], x3); + } + } + q >>= 1; + } + if (q == 8) { + for (j = 0; j < n; j += 2 * q) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + q]); + + int32x8_MINMAX(x0, x1); + + int32x8_store(&x[j], x0); + int32x8_store(&x[j + q], x1); + } + } + + q = n >> 3; + for (k = 0; k < q; k += 8) { + int32x8 x0 = int32x8_load(&x[k]); + int32x8 x1 = int32x8_load(&x[k + q]); + int32x8 x2 = int32x8_load(&x[k + 2 * q]); + int32x8 x3 = int32x8_load(&x[k + 3 * q]); + int32x8 x4 = int32x8_load(&x[k + 4 * q]); + int32x8 x5 = int32x8_load(&x[k + 5 * q]); + int32x8 x6 = int32x8_load(&x[k + 6 * q]); + int32x8 x7 = int32x8_load(&x[k + 7 * q]); + + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + + int32x8_store(&x[k], x0); + int32x8_store(&x[k + q], x1); + int32x8_store(&x[k + 2 * q], x2); + int32x8_store(&x[k + 3 * q], x3); + int32x8_store(&x[k + 4 * q], x4); + int32x8_store(&x[k + 5 * q], x5); + int32x8_store(&x[k + 6 * q], x6); + int32x8_store(&x[k + 7 * q], x7); + } + } + + /* everything is still masked with _mm256_set_epi32(0,-1,0,-1,0,-1,0,-1); */ + mask = _mm256_set1_epi32(-1); + + for (i = 0; i < n; i += 64) { + int32x8 a0 = int32x8_load(&x[i]); + int32x8 a1 = int32x8_load(&x[i + 8]); + int32x8 a2 = int32x8_load(&x[i + 16]); + int32x8 a3 = int32x8_load(&x[i + 24]); + int32x8 a4 = int32x8_load(&x[i + 32]); + int32x8 a5 = int32x8_load(&x[i + 40]); + int32x8 a6 = int32x8_load(&x[i + 48]); + int32x8 a7 = int32x8_load(&x[i + 56]); + + int32x8 b0 = _mm256_unpacklo_epi32(a0, a1); /* AB0AB1AB4AB5 */ + int32x8 b1 = _mm256_unpackhi_epi32(a0, a1); /* AB2AB3AB6AB7 */ + int32x8 b2 = _mm256_unpacklo_epi32(a2, a3); /* CD0CD1CD4CD5 */ + int32x8 b3 = _mm256_unpackhi_epi32(a2, a3); /* CD2CD3CD6CD7 */ + int32x8 b4 = _mm256_unpacklo_epi32(a4, a5); /* EF0EF1EF4EF5 */ + int32x8 b5 = _mm256_unpackhi_epi32(a4, a5); /* EF2EF3EF6EF7 */ + int32x8 b6 = _mm256_unpacklo_epi32(a6, a7); /* GH0GH1GH4GH5 */ + int32x8 b7 = _mm256_unpackhi_epi32(a6, a7); /* GH2GH3GH6GH7 */ + + int32x8 c0 = _mm256_unpacklo_epi64(b0, b2); /* ABCD0ABCD4 */ + int32x8 c1 = _mm256_unpacklo_epi64(b1, b3); /* ABCD2ABCD6 */ + int32x8 c2 = _mm256_unpackhi_epi64(b0, b2); /* ABCD1ABCD5 */ + int32x8 c3 = _mm256_unpackhi_epi64(b1, b3); /* ABCD3ABCD7 */ + int32x8 c4 = _mm256_unpacklo_epi64(b4, b6); /* EFGH0EFGH4 */ + int32x8 c5 = _mm256_unpacklo_epi64(b5, b7); /* EFGH2EFGH6 */ + int32x8 c6 = _mm256_unpackhi_epi64(b4, b6); /* EFGH1EFGH5 */ + int32x8 c7 = _mm256_unpackhi_epi64(b5, b7); /* EFGH3EFGH7 */ + + if (flagdown) { + c2 ^= mask; + c3 ^= mask; + c6 ^= mask; + c7 ^= mask; + } else { + c0 ^= mask; + c1 ^= mask; + c4 ^= mask; + c5 ^= mask; + } + + int32x8 d0 = 
_mm256_permute2x128_si256(c0, c4, 0x20); /* ABCDEFGH0 */ + int32x8 d1 = _mm256_permute2x128_si256(c2, c6, 0x20); /* ABCDEFGH1 */ + int32x8 d2 = _mm256_permute2x128_si256(c1, c5, 0x20); /* ABCDEFGH2 */ + int32x8 d3 = _mm256_permute2x128_si256(c3, c7, 0x20); /* ABCDEFGH5 */ + int32x8 d4 = _mm256_permute2x128_si256(c0, c4, 0x31); /* ABCDEFGH4 */ + int32x8 d5 = _mm256_permute2x128_si256(c2, c6, 0x31); /* ABCDEFGH3 */ + int32x8 d6 = _mm256_permute2x128_si256(c1, c5, 0x31); /* ABCDEFGH6 */ + int32x8 d7 = _mm256_permute2x128_si256(c3, c7, 0x31); /* ABCDEFGH7 */ + + int32x8_MINMAX(d0, d1); + int32x8_MINMAX(d2, d3); + int32x8_MINMAX(d4, d5); + int32x8_MINMAX(d6, d7); + int32x8_MINMAX(d0, d2); + int32x8_MINMAX(d1, d3); + int32x8_MINMAX(d4, d6); + int32x8_MINMAX(d5, d7); + int32x8_MINMAX(d0, d4); + int32x8_MINMAX(d1, d5); + int32x8_MINMAX(d2, d6); + int32x8_MINMAX(d3, d7); + + int32x8 e0 = _mm256_unpacklo_epi32(d0, d1); + int32x8 e1 = _mm256_unpackhi_epi32(d0, d1); + int32x8 e2 = _mm256_unpacklo_epi32(d2, d3); + int32x8 e3 = _mm256_unpackhi_epi32(d2, d3); + int32x8 e4 = _mm256_unpacklo_epi32(d4, d5); + int32x8 e5 = _mm256_unpackhi_epi32(d4, d5); + int32x8 e6 = _mm256_unpacklo_epi32(d6, d7); + int32x8 e7 = _mm256_unpackhi_epi32(d6, d7); + + int32x8 f0 = _mm256_unpacklo_epi64(e0, e2); + int32x8 f1 = _mm256_unpacklo_epi64(e1, e3); + int32x8 f2 = _mm256_unpackhi_epi64(e0, e2); + int32x8 f3 = _mm256_unpackhi_epi64(e1, e3); + int32x8 f4 = _mm256_unpacklo_epi64(e4, e6); + int32x8 f5 = _mm256_unpacklo_epi64(e5, e7); + int32x8 f6 = _mm256_unpackhi_epi64(e4, e6); + int32x8 f7 = _mm256_unpackhi_epi64(e5, e7); + + int32x8 g0 = _mm256_permute2x128_si256(f0, f4, 0x20); + int32x8 g1 = _mm256_permute2x128_si256(f2, f6, 0x20); + int32x8 g2 = _mm256_permute2x128_si256(f1, f5, 0x20); + int32x8 g3 = _mm256_permute2x128_si256(f3, f7, 0x20); + int32x8 g4 = _mm256_permute2x128_si256(f0, f4, 0x31); + int32x8 g5 = _mm256_permute2x128_si256(f2, f6, 0x31); + int32x8 g6 = _mm256_permute2x128_si256(f1, f5, 0x31); + int32x8 g7 = _mm256_permute2x128_si256(f3, f7, 0x31); + + int32x8_store(&x[i], g0); + int32x8_store(&x[i + 8], g1); + int32x8_store(&x[i + 16], g2); + int32x8_store(&x[i + 24], g3); + int32x8_store(&x[i + 32], g4); + int32x8_store(&x[i + 40], g5); + int32x8_store(&x[i + 48], g6); + int32x8_store(&x[i + 56], g7); + } + + q = n >> 4; + while (q >= 128 || q == 32) { + q >>= 2; + for (j = 0; j < n; j += 8 * q) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + } + q >>= 1; + } + while (q >= 16) { + q >>= 1; + for (j = 0; j < n; j += 4 * q) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + 
int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + } + } + q >>= 1; + } + if (q == 8) { + for (j = 0; j < n; j += q + q) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + q]); + int32x8_MINMAX(x0, x1); + int32x8_store(&x[j], x0); + int32x8_store(&x[j + q], x1); + } + } + + q = n >> 3; + for (i = 0; i < q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + + int32x8 b0 = _mm256_unpacklo_epi32(x0, x4); /* AE0AE1AE4AE5 */ + int32x8 b1 = _mm256_unpackhi_epi32(x0, x4); /* AE2AE3AE6AE7 */ + int32x8 b2 = _mm256_unpacklo_epi32(x1, x5); /* BF0BF1BF4BF5 */ + int32x8 b3 = _mm256_unpackhi_epi32(x1, x5); /* BF2BF3BF6BF7 */ + int32x8 b4 = _mm256_unpacklo_epi32(x2, x6); /* CG0CG1CG4CG5 */ + int32x8 b5 = _mm256_unpackhi_epi32(x2, x6); /* CG2CG3CG6CG7 */ + int32x8 b6 = _mm256_unpacklo_epi32(x3, x7); /* DH0DH1DH4DH5 */ + int32x8 b7 = _mm256_unpackhi_epi32(x3, x7); /* DH2DH3DH6DH7 */ + + int32x8 c0 = _mm256_unpacklo_epi64(b0, b4); /* AECG0AECG4 */ + int32x8 c1 = _mm256_unpacklo_epi64(b1, b5); /* AECG2AECG6 */ + int32x8 c2 = _mm256_unpackhi_epi64(b0, b4); /* AECG1AECG5 */ + int32x8 c3 = _mm256_unpackhi_epi64(b1, b5); /* AECG3AECG7 */ + int32x8 c4 = _mm256_unpacklo_epi64(b2, b6); /* BFDH0BFDH4 */ + int32x8 c5 = _mm256_unpacklo_epi64(b3, b7); /* BFDH2BFDH6 */ + int32x8 c6 = _mm256_unpackhi_epi64(b2, b6); /* BFDH1BFDH5 */ + int32x8 c7 = _mm256_unpackhi_epi64(b3, b7); /* BFDH3BFDH7 */ + + int32x8 d0 = _mm256_permute2x128_si256(c0, c4, 0x20); /* AECGBFDH0 */ + int32x8 d1 = _mm256_permute2x128_si256(c1, c5, 0x20); /* AECGBFDH2 */ + int32x8 d2 = _mm256_permute2x128_si256(c2, c6, 0x20); /* AECGBFDH1 */ + int32x8 d3 = _mm256_permute2x128_si256(c3, c7, 0x20); /* AECGBFDH3 */ + int32x8 d4 = _mm256_permute2x128_si256(c0, c4, 0x31); /* AECGBFDH4 */ + int32x8 d5 = _mm256_permute2x128_si256(c1, c5, 0x31); /* AECGBFDH6 */ + int32x8 d6 = _mm256_permute2x128_si256(c2, c6, 0x31); /* AECGBFDH5 */ + int32x8 d7 = _mm256_permute2x128_si256(c3, c7, 0x31); /* AECGBFDH7 */ + + if (flagdown) { + d0 ^= mask; + d1 ^= mask; + d2 ^= mask; + d3 ^= mask; + d4 ^= mask; + d5 ^= mask; + d6 ^= mask; + d7 ^= mask; + } + + int32x8_store(&x[i], d0); + int32x8_store(&x[i + q], d4); + int32x8_store(&x[i + 2 * q], d1); + int32x8_store(&x[i + 3 * q], d5); + int32x8_store(&x[i + 4 * q], d2); + int32x8_store(&x[i + 5 * q], d6); + int32x8_store(&x[i + 6 * q], d3); + int32x8_store(&x[i + 7 * q], d7); + } +} + +void PQCLEAN_MCELIECE8192128_AVX_int32_sort(int32 *x, size_t n) { + size_t q, i, j; + + if (n <= 8) { + if (n == 8) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + 
int32_MINMAX(&x[3], &x[4]); + int32_MINMAX(&x[4], &x[5]); + int32_MINMAX(&x[5], &x[6]); + int32_MINMAX(&x[6], &x[7]); + } + if (n >= 7) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + int32_MINMAX(&x[4], &x[5]); + int32_MINMAX(&x[5], &x[6]); + } + if (n >= 6) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + int32_MINMAX(&x[4], &x[5]); + } + if (n >= 5) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + } + if (n >= 4) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + } + if (n >= 3) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + } + if (n >= 2) { + int32_MINMAX(&x[0], &x[1]); + } + return; + } + + if (!(n & (n - 1))) { + int32_sort_2power(x, n, 0); + return; + } + + q = 8; + while (q < n - q) { + q += q; + } + /* n > q >= 8 */ + + if (q <= 128) { /* n <= 256 */ + int32x8 y[32]; + for (i = q >> 3; i < q >> 2; ++i) { + y[i] = _mm256_set1_epi32(0x7fffffff); + } + for (i = 0; i < n; ++i) { + ((int32 *)y)[i] = x[i]; + } + int32_sort_2power((int32 *) y, 2 * q, 0); + for (i = 0; i < n; ++i) { + x[i] = ((int32 *) y)[i]; + } + return; + } + + int32_sort_2power(x, q, 1); + PQCLEAN_MCELIECE8192128_AVX_int32_sort(x + q, n - q); + + while (q >= 64) { + q >>= 2; + j = int32_threestages(x, n, q); + minmax_vector(x + j, x + j + 4 * q, n - 4 * q - j); + if (j + 4 * q <= n) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + } + j += 4 * q; + } + minmax_vector(x + j, x + j + 2 * q, n - 2 * q - j); + if (j + 2 * q <= n) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8_MINMAX(x0, x1); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + } + j += 2 * q; + } + minmax_vector(x + j, x + j + q, n - q - j); + q >>= 1; + } + if (q == 32) { + j = 0; + for (; j + 64 <= n; j += 64) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 8]); + int32x8 x2 = int32x8_load(&x[j + 16]); + int32x8 x3 = int32x8_load(&x[j + 24]); + int32x8 x4 = int32x8_load(&x[j + 32]); + int32x8 x5 = int32x8_load(&x[j + 40]); + int32x8 x6 = int32x8_load(&x[j + 48]); + int32x8 x7 = int32x8_load(&x[j + 56]); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8 a0 = _mm256_permute2x128_si256(x0, x1, 0x20); + int32x8 a1 = _mm256_permute2x128_si256(x0, x1, 0x31); + int32x8 a2 = _mm256_permute2x128_si256(x2, x3, 0x20); + int32x8 a3 = _mm256_permute2x128_si256(x2, x3, 0x31); + int32x8 a4 = _mm256_permute2x128_si256(x4, x5, 0x20); + int32x8 a5 = _mm256_permute2x128_si256(x4, x5, 0x31); + int32x8 a6 = _mm256_permute2x128_si256(x6, x7, 0x20); + int32x8 a7 = _mm256_permute2x128_si256(x6, x7, 0x31); + int32x8_MINMAX(a0, a1); + 
int32x8_MINMAX(a2, a3); + int32x8_MINMAX(a4, a5); + int32x8_MINMAX(a6, a7); + int32x8 b0 = _mm256_permute2x128_si256(a0, a1, 0x20); + int32x8 b1 = _mm256_permute2x128_si256(a0, a1, 0x31); + int32x8 b2 = _mm256_permute2x128_si256(a2, a3, 0x20); + int32x8 b3 = _mm256_permute2x128_si256(a2, a3, 0x31); + int32x8 b4 = _mm256_permute2x128_si256(a4, a5, 0x20); + int32x8 b5 = _mm256_permute2x128_si256(a4, a5, 0x31); + int32x8 b6 = _mm256_permute2x128_si256(a6, a7, 0x20); + int32x8 b7 = _mm256_permute2x128_si256(a6, a7, 0x31); + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); + int32x8 c2 = _mm256_unpacklo_epi64(b2, b3); + int32x8 c3 = _mm256_unpackhi_epi64(b2, b3); + int32x8 c4 = _mm256_unpacklo_epi64(b4, b5); + int32x8 c5 = _mm256_unpackhi_epi64(b4, b5); + int32x8 c6 = _mm256_unpacklo_epi64(b6, b7); + int32x8 c7 = _mm256_unpackhi_epi64(b6, b7); + int32x8_MINMAX(c0, c1); + int32x8_MINMAX(c2, c3); + int32x8_MINMAX(c4, c5); + int32x8_MINMAX(c6, c7); + int32x8 d0 = _mm256_unpacklo_epi32(c0, c1); + int32x8 d1 = _mm256_unpackhi_epi32(c0, c1); + int32x8 d2 = _mm256_unpacklo_epi32(c2, c3); + int32x8 d3 = _mm256_unpackhi_epi32(c2, c3); + int32x8 d4 = _mm256_unpacklo_epi32(c4, c5); + int32x8 d5 = _mm256_unpackhi_epi32(c4, c5); + int32x8 d6 = _mm256_unpacklo_epi32(c6, c7); + int32x8 d7 = _mm256_unpackhi_epi32(c6, c7); + int32x8 e0 = _mm256_unpacklo_epi64(d0, d1); + int32x8 e1 = _mm256_unpackhi_epi64(d0, d1); + int32x8 e2 = _mm256_unpacklo_epi64(d2, d3); + int32x8 e3 = _mm256_unpackhi_epi64(d2, d3); + int32x8 e4 = _mm256_unpacklo_epi64(d4, d5); + int32x8 e5 = _mm256_unpackhi_epi64(d4, d5); + int32x8 e6 = _mm256_unpacklo_epi64(d6, d7); + int32x8 e7 = _mm256_unpackhi_epi64(d6, d7); + int32x8_MINMAX(e0, e1); + int32x8_MINMAX(e2, e3); + int32x8_MINMAX(e4, e5); + int32x8_MINMAX(e6, e7); + int32x8 f0 = _mm256_unpacklo_epi32(e0, e1); + int32x8 f1 = _mm256_unpackhi_epi32(e0, e1); + int32x8 f2 = _mm256_unpacklo_epi32(e2, e3); + int32x8 f3 = _mm256_unpackhi_epi32(e2, e3); + int32x8 f4 = _mm256_unpacklo_epi32(e4, e5); + int32x8 f5 = _mm256_unpackhi_epi32(e4, e5); + int32x8 f6 = _mm256_unpacklo_epi32(e6, e7); + int32x8 f7 = _mm256_unpackhi_epi32(e6, e7); + int32x8_store(&x[j], f0); + int32x8_store(&x[j + 8], f1); + int32x8_store(&x[j + 16], f2); + int32x8_store(&x[j + 24], f3); + int32x8_store(&x[j + 32], f4); + int32x8_store(&x[j + 40], f5); + int32x8_store(&x[j + 48], f6); + int32x8_store(&x[j + 56], f7); + } + minmax_vector(x + j, x + j + 32, n - 32 - j); + goto continue16; + } + if (q == 16) { + j = 0; +continue16: + for (; j + 32 <= n; j += 32) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 8]); + int32x8 x2 = int32x8_load(&x[j + 16]); + int32x8 x3 = int32x8_load(&x[j + 24]); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8 a0 = _mm256_permute2x128_si256(x0, x1, 0x20); + int32x8 a1 = _mm256_permute2x128_si256(x0, x1, 0x31); + int32x8 a2 = _mm256_permute2x128_si256(x2, x3, 0x20); + int32x8 a3 = _mm256_permute2x128_si256(x2, x3, 0x31); + int32x8_MINMAX(a0, a1); + int32x8_MINMAX(a2, a3); + int32x8 b0 = _mm256_permute2x128_si256(a0, a1, 0x20); + int32x8 b1 = _mm256_permute2x128_si256(a0, a1, 0x31); + int32x8 b2 = _mm256_permute2x128_si256(a2, a3, 0x20); + int32x8 b3 = _mm256_permute2x128_si256(a2, a3, 0x31); + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); + int32x8 c2 = _mm256_unpacklo_epi64(b2, b3); + int32x8 c3 = _mm256_unpackhi_epi64(b2, b3); 
+ int32x8_MINMAX(c0, c1); + int32x8_MINMAX(c2, c3); + int32x8 d0 = _mm256_unpacklo_epi32(c0, c1); + int32x8 d1 = _mm256_unpackhi_epi32(c0, c1); + int32x8 d2 = _mm256_unpacklo_epi32(c2, c3); + int32x8 d3 = _mm256_unpackhi_epi32(c2, c3); + int32x8 e0 = _mm256_unpacklo_epi64(d0, d1); + int32x8 e1 = _mm256_unpackhi_epi64(d0, d1); + int32x8 e2 = _mm256_unpacklo_epi64(d2, d3); + int32x8 e3 = _mm256_unpackhi_epi64(d2, d3); + int32x8_MINMAX(e0, e1); + int32x8_MINMAX(e2, e3); + int32x8 f0 = _mm256_unpacklo_epi32(e0, e1); + int32x8 f1 = _mm256_unpackhi_epi32(e0, e1); + int32x8 f2 = _mm256_unpacklo_epi32(e2, e3); + int32x8 f3 = _mm256_unpackhi_epi32(e2, e3); + int32x8_store(&x[j], f0); + int32x8_store(&x[j + 8], f1); + int32x8_store(&x[j + 16], f2); + int32x8_store(&x[j + 24], f3); + } + minmax_vector(x + j, x + j + 16, n - 16 - j); + goto continue8; + } + /* q == 8 */ + j = 0; +continue8: + for (; j + 16 <= n; j += 16) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 8]); + int32x8_MINMAX(x0, x1); + int32x8_store(&x[j], x0); + int32x8_store(&x[j + 8], x1); + int32x8 a0 = _mm256_permute2x128_si256(x0, x1, 0x20); /* x0123y0123 */ + int32x8 a1 = _mm256_permute2x128_si256(x0, x1, 0x31); /* x4567y4567 */ + int32x8_MINMAX(a0, a1); + int32x8 b0 = _mm256_permute2x128_si256(a0, a1, 0x20); /* x01234567 */ + int32x8 b1 = _mm256_permute2x128_si256(a0, a1, 0x31); /* y01234567 */ + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); /* x01y01x45y45 */ + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); /* x23y23x67y67 */ + int32x8_MINMAX(c0, c1); + int32x8 d0 = _mm256_unpacklo_epi32(c0, c1); /* x02x13x46x57 */ + int32x8 d1 = _mm256_unpackhi_epi32(c0, c1); /* y02y13y46y57 */ + int32x8 e0 = _mm256_unpacklo_epi64(d0, d1); /* x02y02x46y46 */ + int32x8 e1 = _mm256_unpackhi_epi64(d0, d1); /* x13y13x57y57 */ + int32x8_MINMAX(e0, e1); + int32x8 f0 = _mm256_unpacklo_epi32(e0, e1); /* x01234567 */ + int32x8 f1 = _mm256_unpackhi_epi32(e0, e1); /* y01234567 */ + int32x8_store(&x[j], f0); + int32x8_store(&x[j + 8], f1); + } + minmax_vector(x + j, x + j + 8, n - 8 - j); + if (j + 8 <= n) { + int32_MINMAX(&x[j], &x[j + 4]); + int32_MINMAX(&x[j + 1], &x[j + 5]); + int32_MINMAX(&x[j + 2], &x[j + 6]); + int32_MINMAX(&x[j + 3], &x[j + 7]); + int32_MINMAX(&x[j], &x[j + 2]); + int32_MINMAX(&x[j + 1], &x[j + 3]); + int32_MINMAX(&x[j], &x[j + 1]); + int32_MINMAX(&x[j + 2], &x[j + 3]); + int32_MINMAX(&x[j + 4], &x[j + 6]); + int32_MINMAX(&x[j + 5], &x[j + 7]); + int32_MINMAX(&x[j + 4], &x[j + 5]); + int32_MINMAX(&x[j + 6], &x[j + 7]); + j += 8; + } + minmax_vector(x + j, x + j + 4, n - 4 - j); + if (j + 4 <= n) { + int32_MINMAX(&x[j], &x[j + 2]); + int32_MINMAX(&x[j + 1], &x[j + 3]); + int32_MINMAX(&x[j], &x[j + 1]); + int32_MINMAX(&x[j + 2], &x[j + 3]); + j += 4; + } + if (j + 3 <= n) { + int32_MINMAX(&x[j], &x[j + 2]); + } + if (j + 2 <= n) { + int32_MINMAX(&x[j], &x[j + 1]); + } +} diff --git a/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/int32_sort.h b/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/int32_sort.h new file mode 100644 index 0000000000..cd5240b4a8 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/int32_sort.h @@ -0,0 +1,9 @@ +#ifndef PQCLEAN_MCELIECE8192128_AVX_INT32_SORT_H +#define PQCLEAN_MCELIECE8192128_AVX_INT32_SORT_H + +#include +#include + +void PQCLEAN_MCELIECE8192128_AVX_int32_sort(int32_t *x, size_t n); + +#endif diff --git a/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/operations.c b/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/operations.c new 
file mode 100644
index 0000000000..4cd5d3341f
--- /dev/null
+++ b/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/operations.c
@@ -0,0 +1,136 @@
+#include "api.h"
+
+#include "aes256ctr.h"
+#include "controlbits.h"
+#include "crypto_hash.h"
+#include "decrypt.h"
+#include "encrypt.h"
+#include "params.h"
+#include "pk_gen.h"
+#include "randombytes.h"
+#include "sk_gen.h"
+#include "util.h"
+
+#include <stdint.h>
+#include <string.h>
+
+int PQCLEAN_MCELIECE8192128_AVX_crypto_kem_enc(
+    uint8_t *c,
+    uint8_t *key,
+    const uint8_t *pk
+) {
+    uint8_t two_e[ 1 + SYS_N / 8 ] = {2};
+    uint8_t *e = two_e + 1;
+    uint8_t one_ec[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ] = {1};
+
+    PQCLEAN_MCELIECE8192128_AVX_encrypt(c, e, pk);
+
+    crypto_hash_32b(c + SYND_BYTES, two_e, sizeof(two_e));
+
+    memcpy(one_ec + 1, e, SYS_N / 8);
+    memcpy(one_ec + 1 + SYS_N / 8, c, SYND_BYTES + 32);
+
+    crypto_hash_32b(key, one_ec, sizeof(one_ec));
+
+    return 0;
+}
+
+int PQCLEAN_MCELIECE8192128_AVX_crypto_kem_dec(
+    uint8_t *key,
+    const uint8_t *c,
+    const uint8_t *sk
+) {
+    int i;
+
+    uint8_t ret_confirm = 0;
+    uint8_t ret_decrypt = 0;
+
+    uint16_t m;
+
+    uint8_t conf[32];
+    uint8_t two_e[ 1 + SYS_N / 8 ] = {2};
+    uint8_t *e = two_e + 1;
+    uint8_t preimage[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ];
+    uint8_t *x = preimage;
+
+    //
+
+    ret_decrypt = (uint8_t)PQCLEAN_MCELIECE8192128_AVX_decrypt(e, sk + SYS_N / 8, c);
+
+    crypto_hash_32b(conf, two_e, sizeof(two_e));
+
+    for (i = 0; i < 32; i++) {
+        ret_confirm |= conf[i] ^ c[SYND_BYTES + i];
+    }
+
+    m = ret_decrypt | ret_confirm;
+    m -= 1;
+    m >>= 8;
+
+    *x++ = (~m & 0) | (m & 1);
+    for (i = 0; i < SYS_N / 8; i++) {
+        *x++ = (~m & sk[i]) | (m & e[i]);
+    }
+    for (i = 0; i < SYND_BYTES + 32; i++) {
+        *x++ = c[i];
+    }
+
+    crypto_hash_32b(key, preimage, sizeof(preimage));
+
+    return 0;
+}
+
+int PQCLEAN_MCELIECE8192128_AVX_crypto_kem_keypair
+(
+    uint8_t *pk,
+    uint8_t *sk
+) {
+    int i;
+    uint8_t seed[ 32 ];
+    uint8_t r[ SYS_T * 2 + (1 << GFBITS)*sizeof(uint32_t) + SYS_N / 8 + 32 ];
+    uint8_t nonce[ 16 ] = {0};
+    uint8_t *rp;
+
+    gf f[ SYS_T ]; // element in GF(2^mt)
+    gf irr[ SYS_T ]; // Goppa polynomial
+    uint32_t perm[ 1 << GFBITS ]; // random permutation
+
+    randombytes(seed, sizeof(seed));
+
+    while (1) {
+        rp = r;
+        PQCLEAN_MCELIECE8192128_AVX_aes256ctr(r, sizeof(r), nonce, seed);
+        memcpy(seed, &r[ sizeof(r) - 32 ], 32);
+
+        for (i = 0; i < SYS_T; i++) {
+            f[i] = PQCLEAN_MCELIECE8192128_AVX_load2(rp + i * 2);
+        }
+        rp += sizeof(f);
+        if (PQCLEAN_MCELIECE8192128_AVX_genpoly_gen(irr, f)) {
+            continue;
+        }
+
+        for (i = 0; i < (1 << GFBITS); i++) {
+            perm[i] = PQCLEAN_MCELIECE8192128_AVX_load4(rp + i * 4);
+        }
+        rp += sizeof(perm);
+        if (PQCLEAN_MCELIECE8192128_AVX_perm_check(perm)) {
+            continue;
+        }
+
+        for (i = 0; i < SYS_T; i++) {
+            PQCLEAN_MCELIECE8192128_AVX_store2(sk + SYS_N / 8 + i * 2, irr[i]);
+        }
+        if (PQCLEAN_MCELIECE8192128_AVX_pk_gen(pk, perm, sk + SYS_N / 8)) {
+            continue;
+        }
+
+        memcpy(sk, rp, SYS_N / 8);
+        PQCLEAN_MCELIECE8192128_AVX_controlbits(sk + SYS_N / 8 + IRR_BYTES, perm);
+
+        break;
+    }
+
+    return 0;
+}
+
diff --git a/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/params.h b/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/params.h
new file mode 100644
index 0000000000..5e2934b931
--- /dev/null
+++ b/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/params.h
@@ -0,0 +1,21 @@
+#ifndef PQCLEAN_MCELIECE8192128_AVX_PARAMS_H
+#define PQCLEAN_MCELIECE8192128_AVX_PARAMS_H
+
+#define GFBITS 13
+#define SYS_N 8192
+#define SYS_T 128
+
+#define COND_BYTES ((1 << 
(GFBITS-4))*(2*GFBITS - 1)) +#define IRR_BYTES (SYS_T * 2) + +#define PK_NROWS (SYS_T*GFBITS) +#define PK_NCOLS (SYS_N - PK_NROWS) +#define PK_ROW_BYTES ((PK_NCOLS + 7)/8) + +#define SK_BYTES (SYS_N/8 + IRR_BYTES + COND_BYTES) +#define SYND_BYTES ((PK_NROWS + 7)/8) + +#define GFMASK ((1 << GFBITS) - 1) + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/pk_gen.c b/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/pk_gen.c new file mode 100644 index 0000000000..cc6c83b01e --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/pk_gen.c @@ -0,0 +1,288 @@ +/* + This file is for public-key generation +*/ + +#include "pk_gen.h" + +#include "benes.h" +#include "controlbits.h" +#include "fft.h" +#include "params.h" +#include "util.h" + +#include + +static void de_bitslicing(uint64_t *out, vec256 in[][GFBITS]) { + int i, j, r; + uint64_t u = 0; + + for (i = 0; i < (1 << GFBITS); i++) { + out[i] = 0 ; + } + + for (i = 0; i < 32; i++) { + for (j = GFBITS - 1; j >= 0; j--) { + u = PQCLEAN_MCELIECE8192128_AVX_vec256_extract(in[i][j], 0); + for (r = 0; r < 64; r++) { + out[i * 256 + 0 * 64 + r] <<= 1; + out[i * 256 + 0 * 64 + r] |= (u >> r) & 1; + } + u = PQCLEAN_MCELIECE8192128_AVX_vec256_extract(in[i][j], 1); + for (r = 0; r < 64; r++) { + out[i * 256 + 1 * 64 + r] <<= 1; + out[i * 256 + 1 * 64 + r] |= (u >> r) & 1; + } + u = PQCLEAN_MCELIECE8192128_AVX_vec256_extract(in[i][j], 2); + for (r = 0; r < 64; r++) { + out[i * 256 + 2 * 64 + r] <<= 1; + out[i * 256 + 2 * 64 + r] |= (u >> r) & 1; + } + u = PQCLEAN_MCELIECE8192128_AVX_vec256_extract(in[i][j], 3); + for (r = 0; r < 64; r++) { + out[i * 256 + 3 * 64 + r] <<= 1; + out[i * 256 + 3 * 64 + r] |= (u >> r) & 1; + } + } + } +} + +static void to_bitslicing_2x(vec256 out0[][GFBITS], vec256 out1[][GFBITS], const uint64_t *in) { + int i, j, k, r; + uint64_t u[4] = {0}; + + for (i = 0; i < 32; i++) { + for (j = GFBITS - 1; j >= 0; j--) { + for (k = 0; k < 4; k++) { + for (r = 63; r >= 0; r--) { + u[k] <<= 1; + u[k] |= (in[i * 256 + k * 64 + r] >> (j + GFBITS)) & 1; + } + } + + out1[i][j] = PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(u[0], u[1], u[2], u[3]); + } + + for (j = GFBITS - 1; j >= 0; j--) { + for (k = 0; k < 4; k++) { + for (r = 63; r >= 0; r--) { + u[k] <<= 1; + u[k] |= (in[i * 256 + k * 64 + r] >> j) & 1; + } + } + + out0[i][GFBITS - 1 - j] = PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(u[0], u[1], u[2], u[3]); + } + } +} + +#define NBLOCKS2_H ((SYS_N + 255) / 256) +#define NBLOCKS2_I ((GFBITS * SYS_T + 255) / 256) +int PQCLEAN_MCELIECE8192128_AVX_pk_gen(unsigned char *pk, uint32_t *perm, const unsigned char *sk) { + int i, j, k; + int row, c, d; + + uint64_t mat[ GFBITS * SYS_T ][ 128 ]; + uint64_t ops[ GFBITS * SYS_T ][ GFBITS * SYS_T / 64 ]; + + uint64_t mask; + + vec128 sk_int[ GFBITS ]; + + vec256 consts[ 32 ][ GFBITS ]; + vec256 eval[ 32 ][ GFBITS ]; + vec256 prod[ 32 ][ GFBITS ]; + vec256 tmp[ GFBITS ]; + + uint64_t list[1 << GFBITS]; + uint64_t one_row[ (SYS_N - GFBITS * SYS_T) / 64 ]; + + // compute the inverses + + PQCLEAN_MCELIECE8192128_AVX_irr_load(sk_int, sk); + + PQCLEAN_MCELIECE8192128_AVX_fft(eval, sk_int); + + PQCLEAN_MCELIECE8192128_AVX_vec256_copy(prod[0], eval[0]); + + for (i = 1; i < 32; i++) { + PQCLEAN_MCELIECE8192128_AVX_vec256_mul(prod[i], prod[i - 1], eval[i]); + } + + PQCLEAN_MCELIECE8192128_AVX_vec256_inv(tmp, prod[31]); + + for (i = 30; i >= 0; i--) { + PQCLEAN_MCELIECE8192128_AVX_vec256_mul(prod[i + 1], prod[i], tmp); + PQCLEAN_MCELIECE8192128_AVX_vec256_mul(tmp, tmp, eval[i 
+ 1]); + } + + PQCLEAN_MCELIECE8192128_AVX_vec256_copy(prod[0], tmp); + + // fill matrix + + de_bitslicing(list, prod); + + for (i = 0; i < (1 << GFBITS); i++) { + list[i] <<= GFBITS; + list[i] |= i; + list[i] |= ((uint64_t) perm[i]) << 31; + } + + PQCLEAN_MCELIECE8192128_AVX_sort_63b(1 << GFBITS, list); + + to_bitslicing_2x(consts, prod, list); + + for (i = 0; i < (1 << GFBITS); i++) { + perm[i] = list[i] & GFMASK; + } + + for (j = 0; j < NBLOCKS2_I; j++) { + for (k = 0; k < GFBITS; k++) { + mat[ k ][ 4 * j + 0 ] = PQCLEAN_MCELIECE8192128_AVX_vec256_extract(prod[ j ][ k ], 0); + mat[ k ][ 4 * j + 1 ] = PQCLEAN_MCELIECE8192128_AVX_vec256_extract(prod[ j ][ k ], 1); + mat[ k ][ 4 * j + 2 ] = PQCLEAN_MCELIECE8192128_AVX_vec256_extract(prod[ j ][ k ], 2); + mat[ k ][ 4 * j + 3 ] = PQCLEAN_MCELIECE8192128_AVX_vec256_extract(prod[ j ][ k ], 3); + } + } + + for (i = 1; i < SYS_T; i++) { + for (j = 0; j < NBLOCKS2_I; j++) { + PQCLEAN_MCELIECE8192128_AVX_vec256_mul(prod[j], prod[j], consts[j]); + + for (k = 0; k < GFBITS; k++) { + mat[ i * GFBITS + k ][ 4 * j + 0 ] = PQCLEAN_MCELIECE8192128_AVX_vec256_extract(prod[ j ][ k ], 0); + mat[ i * GFBITS + k ][ 4 * j + 1 ] = PQCLEAN_MCELIECE8192128_AVX_vec256_extract(prod[ j ][ k ], 1); + mat[ i * GFBITS + k ][ 4 * j + 2 ] = PQCLEAN_MCELIECE8192128_AVX_vec256_extract(prod[ j ][ k ], 2); + mat[ i * GFBITS + k ][ 4 * j + 3 ] = PQCLEAN_MCELIECE8192128_AVX_vec256_extract(prod[ j ][ k ], 3); + } + } + } + + // gaussian elimination to obtain an upper triangular matrix + // and keep track of the operations in ops + + for (i = 0; i < (GFBITS * SYS_T) / 64; i++) { + for (j = 0; j < 64; j++) { + row = i * 64 + j; + for (c = 0; c < (GFBITS * SYS_T) / 64; c++) { + ops[ row ][ c ] = 0; + } + } + } + + for (i = 0; i < (GFBITS * SYS_T) / 64; i++) { + for (j = 0; j < 64; j++) { + row = i * 64 + j; + + ops[ row ][ i ] = 1; + ops[ row ][ i ] <<= j; + } + } + + for (i = 0; i < (GFBITS * SYS_T) / 64; i++) { + for (j = 0; j < 64; j++) { + row = i * 64 + j; + + for (k = row + 1; k < GFBITS * SYS_T; k++) { + mask = mat[ row ][ i ] >> j; + mask &= 1; + mask -= 1; + + for (c = 0; c < (GFBITS * SYS_T) / 64; c++) { + mat[ row ][ c ] ^= mat[ k ][ c ] & mask; + ops[ row ][ c ] ^= ops[ k ][ c ] & mask; + } + } + + if ( ((mat[ row ][ i ] >> j) & 1) == 0 ) { // return if not systematic + return -1; + } + + for (k = row + 1; k < GFBITS * SYS_T; k++) { + mask = mat[ k ][ i ] >> j; + mask &= 1; + mask = -mask; + + for (c = 0; c < (GFBITS * SYS_T) / 64; c++) { + mat[ k ][ c ] ^= mat[ row ][ c ] & mask; + ops[ k ][ c ] ^= ops[ row ][ c ] & mask; + } + } + } + } + + // computing the lineaer map required to obatin the systematic form + + for (i = (GFBITS * SYS_T) / 64 - 1; i >= 0; i--) { + for (j = 63; j >= 0; j--) { + row = i * 64 + j; + + for (k = 0; k < row; k++) { + { + mask = mat[ k ][ i ] >> j; + mask &= 1; + mask = -mask; + + for (c = 0; c < (GFBITS * SYS_T) / 64; c++) { + ops[ k ][ c ] ^= ops[ row ][ c ] & mask; + } + } + } + } + } + + // apply the linear map to the non-systematic part + + for (j = NBLOCKS2_I; j < NBLOCKS2_H; j++) { + for (k = 0; k < GFBITS; k++) { + mat[ k ][ 4 * j + 0 ] = PQCLEAN_MCELIECE8192128_AVX_vec256_extract(prod[ j ][ k ], 0); + mat[ k ][ 4 * j + 1 ] = PQCLEAN_MCELIECE8192128_AVX_vec256_extract(prod[ j ][ k ], 1); + mat[ k ][ 4 * j + 2 ] = PQCLEAN_MCELIECE8192128_AVX_vec256_extract(prod[ j ][ k ], 2); + mat[ k ][ 4 * j + 3 ] = PQCLEAN_MCELIECE8192128_AVX_vec256_extract(prod[ j ][ k ], 3); + } + } + + for (i = 1; i < SYS_T; i++) { + for (j = NBLOCKS2_I; j < 
NBLOCKS2_H; j++) { + PQCLEAN_MCELIECE8192128_AVX_vec256_mul(prod[j], prod[j], consts[j]); + + for (k = 0; k < GFBITS; k++) { + mat[ i * GFBITS + k ][ 4 * j + 0 ] = PQCLEAN_MCELIECE8192128_AVX_vec256_extract(prod[ j ][ k ], 0); + mat[ i * GFBITS + k ][ 4 * j + 1 ] = PQCLEAN_MCELIECE8192128_AVX_vec256_extract(prod[ j ][ k ], 1); + mat[ i * GFBITS + k ][ 4 * j + 2 ] = PQCLEAN_MCELIECE8192128_AVX_vec256_extract(prod[ j ][ k ], 2); + mat[ i * GFBITS + k ][ 4 * j + 3 ] = PQCLEAN_MCELIECE8192128_AVX_vec256_extract(prod[ j ][ k ], 3); + } + } + } + + for (i = 0; i < (GFBITS * SYS_T) / 64; i++) { + for (j = 0; j < 64; j++) { + row = i * 64 + j; + + for (k = 0; k < (SYS_N - GFBITS * SYS_T) / 64; k++) { + one_row[ k ] = 0; + } + + for (c = 0; c < (GFBITS * SYS_T) / 64; c++) { + for (d = 0; d < 64; d++) { + mask = ops[ row ][ c ] >> d; + mask &= 1; + mask = -mask; + + for (k = 0; k < (SYS_N - GFBITS * SYS_T) / 64; k++) { + one_row[ k ] ^= mat[ c * 64 + d ][ k + (GFBITS * SYS_T) / 64 ] & mask; + } + } + } + + for (k = 0; k < (SYS_N - GFBITS * SYS_T) / 64; k++) { + PQCLEAN_MCELIECE8192128_AVX_store8(pk, one_row[ k ]); + pk += 8; + } + } + } + + // + + return 0; +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/pk_gen.h b/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/pk_gen.h new file mode 100644 index 0000000000..2e30edc6e6 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/pk_gen.h @@ -0,0 +1,12 @@ +#ifndef PQCLEAN_MCELIECE8192128_AVX_PK_GEN_H +#define PQCLEAN_MCELIECE8192128_AVX_PK_GEN_H +/* + This file is for public-key generation +*/ + +#include + +int PQCLEAN_MCELIECE8192128_AVX_pk_gen(unsigned char * /*pk*/, uint32_t * /*perm*/, const unsigned char * /*sk*/); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/powers.inc b/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/powers.inc new file mode 100644 index 0000000000..0219933d23 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/powers.inc @@ -0,0 +1,480 @@ +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + 
PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 
0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + 
PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X00000000FFFFFFFF, 
0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + 
PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 
0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + 
PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 
0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + 
PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 
0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + 
PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 
0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + 
PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 
0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + 
PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +} diff --git a/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/scalars_2x.inc b/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/scalars_2x.inc new file mode 100644 index 0000000000..7ea5a308b0 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/scalars_2x.inc @@ -0,0 +1,75 @@ +{ + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0X3C3CF30C0000C003, 0X0C0F0FCF0F0CF330), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0X0CCCC3F333C0000C, 0XF0000FC33C3CCF3C), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0X03C33F33FCC0C03C, 0X3C0F3F00C3C300FC), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0X0003000F3C03C0C0, 0X3C33CCC0F0F3CC30), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0XF33FF33030CF03F0, 0XC0CFFFFFCCCC30CC), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0X0CF0303300F0CCC0, 0X3FC3F3CCFFFC033F), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0XFF3F0C0CC0FF3CC0, 0XFC3030CCCCC0CFCF), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0XCF3CF0FF003FC000, 0X0FCF0C00CCF333C3), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0XC00FF3CF0303F300, 0XCFFCF33000CFF030), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0X3CCC0CC00CF0CC00, 0X00CFFCC330F30FCC), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0XF30FFC3C3FCCFC00, 0X3CCC3FCCC0F3FFF3), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0X3F0FC3F0CCF0C000, 0XF00F0C3FC003C0FF), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0X3000FF33CCF0F000, 0X330CCFCC03C0FC33) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0X0F0F0FF0F000000F, 0XF0FFFFFFF0F00F00), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0X00FFFFFFFF0000F0, 0X00FFF0FFFF0000FF), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0XFFFF00FF00000F00, 0X00FF00000F0F0FFF), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0XFFF000F00F0FF000, 0XF000F0000F00FF0F), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0XFFF0000F0FF000F0, 0XFF000000FFF00000), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0X00FF000FFF000000, 0XF0FF000FF00F0FF0), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0XFF0F0FFF0F0FF000, 0X0F0F0F00FF000F0F), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0X0FFF0000000F0000, 0X0F0F00F0F0F0F000), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0X00F000F0FFF00F00, 0X00F00F00F00F000F), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0X00F00FF00F00F000, 0X00F0F0F00000FFF0), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0XFFF000F000F00000, 0XFFFFFF0FF00F0FFF), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0X00F00F000FF00000, 0X0F0FFFF00FFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0X0000FF0F0000F000, 0XFFFF0F0FFF0FFF00) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0X00FF0000000000FF, 0X00FF00FF00FF0000), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0XFFFFFFFFFF00FF00, 0XFF00FFFF000000FF), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0XFF0000FF00FF0000, 0X0000FFFF000000FF), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0XFFFF000000FF0000, 0X00FFFF00FF000000), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0XFF00000000FF0000, 0XFFFFFF0000FF00FF), + 
PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0X00FFFFFFFF000000, 0X0000FFFF00FFFF00), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0XFF0000FFFFFF0000, 0XFF00FF0000FFFF00), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0XFF00FF00FFFF0000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0X00FFFFFFFF00FF00, 0X0000FF0000000000), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0XFFFF000000000000, 0XFF00FFFF00FFFF00), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0X00FF0000FF000000, 0X00FFFF00000000FF), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0XFF00FF00FF000000, 0X0000FF00FF00FFFF), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0X00FF00FFFF000000, 0XFF0000FFFFFF0000) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0X000000000000FFFF, 0X0000FFFF00000000), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0XFFFFFFFFFFFF0000, 0XFFFFFFFF0000FFFF), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0X0000000000000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0XFFFF0000FFFF0000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0XFFFFFFFFFFFF0000, 0X0000FFFF00000000), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0X0000FFFF00000000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0X0000FFFF00000000, 0XFFFFFFFF0000FFFF), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0XFFFF000000000000, 0X00000000FFFF0000), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0XFFFF000000000000, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0XFFFF000000000000, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0X00000000FFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0X0000000000000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000) +} diff --git a/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/scalars_4x.inc b/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/scalars_4x.inc new file mode 100644 index 0000000000..57b781170b --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/scalars_4x.inc @@ -0,0 +1,91 @@ +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X3C3CF30C0000C003, 0X0C0F0FCF0F0CF330, 0XF0F30C33CF03F03F, 0X3F30CC0C000F3FCC), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0CCCC3F333C0000C, 0XF0000FC33C3CCF3C, 0X00F30FC00C3300FF, 0XFC3CF030FC3FFF03), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X03C33F33FCC0C03C, 0X3C0F3F00C3C300FC, 0XF3CC3CF3F3FCF33F, 0X33FFFCFF0CCF3CC3), + 
PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0003000F3C03C0C0, 0X3C33CCC0F0F3CC30, 0X3C0FC0FC303C3F3C, 0X003CFF33C3CC30CF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF33FF33030CF03F0, 0XC0CFFFFFCCCC30CC, 0XFC30CF303F3FF00F, 0XCFF3CF33C00F3003), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0CF0303300F0CCC0, 0X3FC3F3CCFFFC033F, 0X33300C0CC3300CF3, 0X00F3CC0CF3003CCF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFF3F0C0CC0FF3CC0, 0XFC3030CCCCC0CFCF, 0X3C030CF3F03FF3F3, 0X3C000CFCCC3C3333), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XCF3CF0FF003FC000, 0X0FCF0C00CCF333C3, 0X3CCC03FCCC3FFC03, 0XF3CF03C0FCF03FF0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XC00FF3CF0303F300, 0XCFFCF33000CFF030, 0X033C3C3CF0003FC3, 0X3F3C3CF0C330330C), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X3CCC0CC00CF0CC00, 0X00CFFCC330F30FCC, 0XFFC0FF00F0FF0F03, 0X33CCFCC0FF0033F0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XF30FFC3C3FCCFC00, 0X3CCC3FCCC0F3FFF3, 0XF3F30CF003FCC303, 0X33C300C0F0C003F3), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X3F0FC3F0CCF0C000, 0XF00F0C3FC003C0FF, 0X30CFCFC3CC0F3000, 0X003FF0003F00C00C), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X3000FF33CCF0F000, 0X330CCFCC03C0FC33, 0X0CF30CCF3FCFCC0F, 0XCFF3C3033F030FFF) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0F0F0FF0F000000F, 0XF0FFFFFFF0F00F00, 0X0F0F00FF0FF0FFFF, 0XFF0F0F00F000F0FF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X00FFFFFFFF0000F0, 0X00FFF0FFFF0000FF, 0XF000F0F00F00FF0F, 0X0FFFFFFFFF00000F), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFF00FF00000F00, 0X00FF00000F0F0FFF, 0X000FFFF0FFF0FF0F, 0XF0FFFF000F00F0FF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFF000F00F0FF000, 0XF000F0000F00FF0F, 0X00F00FFF00000FF0, 0X0F0000F00FFF0FFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFF0000F0FF000F0, 0XFF000000FFF00000, 0XFFFFF0000FFFF00F, 0X0F0F0F00FF0F000F), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X00FF000FFF000000, 0XF0FF000FF00F0FF0, 0XFFF0FFF0000FFFF0, 0X000F0F0FFFF0F000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFF0F0FFF0F0FF000, 0X0F0F0F00FF000F0F, 0XF0F0F0000F0F0F00, 0XF0FFFF0F00F0FF0F), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0FFF0000000F0000, 0X0F0F00F0F0F0F000, 0X00F000F0F00FFF00, 0X0F0F000F0F00F0FF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X00F000F0FFF00F00, 0X00F00F00F00F000F, 0XF0FF0F0FFF00F0FF, 0X0000F0FF00FF0F0F), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X00F00FF00F00F000, 0X00F0F0F00000FFF0, 0XF0FF0FFFF0F0F0FF, 0X00FFFF0FF0FFF0F0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFF000F000F00000, 0XFFFFFF0FF00F0FFF, 0X00FFFFFFFFFFFFF0, 0X0000000F00F0FFF0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X00F00F000FF00000, 0X0F0FFFF00FFFFFFF, 0X00FFF0F0FF000F0F, 0XF0F00000FF00F0F0), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000FF0F0000F000, 0XFFFF0F0FFF0FFF00, 0X000FFFF0000FFF00, 0X0F0F0FFFFFFFFFFF) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X00FF0000000000FF, 0X00FF00FF00FF0000, 0XFFFF00FF00FF00FF, 0XFF0000FFFFFF00FF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFFFF00FF00, 0XFF00FFFF000000FF, 0X00FFFF000000FF00, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFF0000FF00FF0000, 0X0000FFFF000000FF, 0XFFFF00FFFFFFFF00, 0XFFFF000000FFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFF000000FF0000, 0X00FFFF00FF000000, 0X0000FFFF00FFFFFF, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFF00000000FF0000, 0XFFFFFF0000FF00FF, 0X00FF0000FF0000FF, 0XFFFFFF00FFFFFF00), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X00FFFFFFFF000000, 
0X0000FFFF00FFFF00, 0XFFFF0000FF00FFFF, 0X00FFFF00FFFF00FF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFF0000FFFFFF0000, 0XFF00FF0000FFFF00, 0XFF000000FFFFFF00, 0X0000FFFF00FF0000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFF00FF00FFFF0000, 0X00000000FFFFFFFF, 0X000000000000FFFF, 0X000000FFFF000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X00FFFFFFFF00FF00, 0X0000FF0000000000, 0XFF00FF00FFFF0000, 0XFF00FF0000FF00FF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFF000000000000, 0XFF00FFFF00FFFF00, 0XFFFF00FFFF00FFFF, 0X00FF0000000000FF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X00FF0000FF000000, 0X00FFFF00000000FF, 0XFFFFFFFFFF00FF00, 0XFF00FFFF00FF00FF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFF00FF00FF000000, 0X0000FF00FF00FFFF, 0XFFFF00FFFF0000FF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X00FF00FFFF000000, 0XFF0000FFFFFF0000, 0X0000FF00000000FF, 0X0000FF000000FFFF) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X000000000000FFFF, 0X0000FFFF00000000, 0XFFFFFFFFFFFFFFFF, 0X0000FFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFFFFFF0000, 0XFFFFFFFF0000FFFF, 0XFFFFFFFF00000000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X00000000FFFFFFFF, 0XFFFF000000000000, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000000000000000, 0X0000FFFF00000000, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFFFFFF0000, 0X0000FFFF00000000, 0X00000000FFFF0000, 0X00000000FFFF0000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000FFFF00000000, 0XFFFF0000FFFF0000, 0X0000FFFFFFFFFFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000, 0X0000FFFFFFFFFFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0XFFFFFFFF00000000, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000FFFF00000000, 0XFFFFFFFF0000FFFF, 0X000000000000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFF000000000000, 0X00000000FFFF0000, 0X000000000000FFFF, 0X0000FFFF00000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFF000000000000, 0XFFFF0000FFFFFFFF, 0XFFFFFFFFFFFF0000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFF000000000000, 0XFFFF0000FFFFFFFF, 0XFFFFFFFF0000FFFF, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0X0000000000000000, 0XFFFF0000FFFFFFFF, 0X0000FFFFFFFFFFFF) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X0000000000000000, 0X00000000FFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFF00000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0X0000000000000000, 0XFFFFFFFF00000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0X00000000FFFFFFFF), + 
PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFF00000000, 0X00000000FFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFF00000000) +}, +{ + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF) +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/sk_gen.c b/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/sk_gen.c new file mode 100644 index 0000000000..7901b18ae7 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/sk_gen.c @@ -0,0 +1,98 @@ +/* + This file is for secret-key generation +*/ + +#include "sk_gen.h" + +#include "controlbits.h" +#include "gf.h" +#include "params.h" +#include "util.h" + +/* input: f, element in GF((2^m)^t) */ +/* output: out, minimal polynomial of f */ +/* return: 0 for success and -1 for failure */ +int PQCLEAN_MCELIECE8192128_AVX_genpoly_gen(gf *out, gf *f) { + int i, j, k, c; + + gf mat[ SYS_T + 1 ][ SYS_T ]; + gf mask, inv, t; + + // fill matrix + + mat[0][0] = 1; + + for (i = 1; i < SYS_T; i++) { + mat[0][i] = 0; + } + + for (i = 0; i < SYS_T; i++) { + mat[1][i] = f[i]; + } + + for (j = 2; j <= SYS_T; j++) { + PQCLEAN_MCELIECE8192128_AVX_GF_mul(mat[j], mat[j - 1], f); + } + + // gaussian + + for (j = 0; j < SYS_T; j++) { + for (k = j + 1; k < SYS_T; k++) { + mask = PQCLEAN_MCELIECE8192128_AVX_gf_iszero(mat[ j ][ j ]); + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ j ] ^= mat[ c ][ k ] & mask; + } + + } + + if ( mat[ j ][ j ] == 0 ) { // return if not systematic + return -1; + } + + inv = 
PQCLEAN_MCELIECE8192128_AVX_gf_inv(mat[j][j]); + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ j ] = PQCLEAN_MCELIECE8192128_AVX_gf_mul(mat[ c ][ j ], inv) ; + } + + for (k = 0; k < SYS_T; k++) { + if (k != j) { + t = mat[ j ][ k ]; + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ k ] ^= PQCLEAN_MCELIECE8192128_AVX_gf_mul(mat[ c ][ j ], t); + } + } + } + } + + for (i = 0; i < SYS_T; i++) { + out[i] = mat[ SYS_T ][ i ]; + } + + return 0; +} + +/* input: permutation p represented as a list of 32-bit integers */ +/* output: -1 if some integer repeats in p */ +/* 0 otherwise */ +int PQCLEAN_MCELIECE8192128_AVX_perm_check(const uint32_t *p) { + int i; + uint64_t list[1 << GFBITS]; + + for (i = 0; i < (1 << GFBITS); i++) { + list[i] = p[i]; + } + + PQCLEAN_MCELIECE8192128_AVX_sort_63b(1 << GFBITS, list); + + for (i = 1; i < (1 << GFBITS); i++) { + if (list[i - 1] == list[i]) { + return -1; + } + } + + return 0; +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/sk_gen.h b/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/sk_gen.h new file mode 100644 index 0000000000..d0d4c53abe --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/sk_gen.h @@ -0,0 +1,16 @@ +#ifndef PQCLEAN_MCELIECE8192128_AVX_SK_GEN_H +#define PQCLEAN_MCELIECE8192128_AVX_SK_GEN_H +/* + This file is for secret-key generation +*/ + + +#include "gf.h" + +#include <stdint.h> + +int PQCLEAN_MCELIECE8192128_AVX_genpoly_gen(gf * /*out*/, gf * /*f*/); +int PQCLEAN_MCELIECE8192128_AVX_perm_check(const uint32_t * /*p*/); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/syndrome_asm.S b/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/syndrome_asm.S new file mode 100644 index 0000000000..5afc055a64 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/syndrome_asm.S @@ -0,0 +1,910 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: int64 b64 + +# qhasm: int64 synd + +# qhasm: int64 addr + +# qhasm: int64 c + +# qhasm: int64 c_all + +# qhasm: int64 row + +# qhasm: int64 p + +# qhasm: int64 e + +# qhasm: int64 s + +# qhasm: reg256 pp + +# qhasm: reg256 ee + +# qhasm: reg256 ss + +# qhasm: int64 buf_ptr + +# qhasm: stack256 buf + +# qhasm: enter syndrome_asm +.p2align 5 +.global _PQCLEAN_MCELIECE8192128_AVX_syndrome_asm +.global PQCLEAN_MCELIECE8192128_AVX_syndrome_asm +_PQCLEAN_MCELIECE8192128_AVX_syndrome_asm: +PQCLEAN_MCELIECE8192128_AVX_syndrome_asm: +mov %rsp,%r11 +and $31,%r11 +add $32,%r11 +sub %r11,%rsp + +# qhasm: input_1 += 1357008 +# asm 1: add $1357008,buf_ptr=int64#4 +# asm 2: leaq buf_ptr=%rcx +leaq 0(%rsp),%rcx + +# qhasm: row = 1664 +# asm 1: mov $1664,>row=int64#5 +# asm 2: mov $1664,>row=%r8 +mov $1664,%r8 + +# qhasm: loop: +._loop: + +# qhasm: row -= 1 +# asm 1: sub $1,ss=reg256#1 +# asm 2: vmovupd 0(ss=%ymm0 +vmovupd 0(%rsi),%ymm0 + +# qhasm: ee = mem256[ input_2 + 208 ] +# asm 1: vmovupd 208(ee=reg256#2 +# asm 2: vmovupd 208(ee=%ymm1 +vmovupd 208(%rdx),%ymm1 + +# qhasm: ss &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 32(pp=%ymm1 +vmovupd 32(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 240 ] +# asm 1: vmovupd 240(ee=reg256#3 +# asm 2: vmovupd 240(ee=%ymm2
+vmovupd 240(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 64(pp=%ymm1 +vmovupd 64(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 272 ] +# asm 1: vmovupd 272(ee=reg256#3 +# asm 2: vmovupd 272(ee=%ymm2 +vmovupd 272(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 96(pp=%ymm1 +vmovupd 96(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 304 ] +# asm 1: vmovupd 304(ee=reg256#3 +# asm 2: vmovupd 304(ee=%ymm2 +vmovupd 304(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 128(pp=%ymm1 +vmovupd 128(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 336 ] +# asm 1: vmovupd 336(ee=reg256#3 +# asm 2: vmovupd 336(ee=%ymm2 +vmovupd 336(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 160(pp=%ymm1 +vmovupd 160(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 368 ] +# asm 1: vmovupd 368(ee=reg256#3 +# asm 2: vmovupd 368(ee=%ymm2 +vmovupd 368(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 192(pp=%ymm1 +vmovupd 192(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 400 ] +# asm 1: vmovupd 400(ee=reg256#3 +# asm 2: vmovupd 400(ee=%ymm2 +vmovupd 400(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 224(pp=%ymm1 +vmovupd 224(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 432 ] +# asm 1: vmovupd 432(ee=reg256#3 +# asm 2: vmovupd 432(ee=%ymm2 +vmovupd 432(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 256(pp=%ymm1 +vmovupd 256(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 464 ] +# asm 1: vmovupd 464(ee=reg256#3 +# asm 2: vmovupd 464(ee=%ymm2 +vmovupd 464(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 288(pp=%ymm1 +vmovupd 288(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 496 ] +# asm 1: vmovupd 496(ee=reg256#3 +# asm 2: vmovupd 496(ee=%ymm2 +vmovupd 496(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 320(pp=%ymm1 +vmovupd 320(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 528 ] +# asm 1: vmovupd 528(ee=reg256#3 +# asm 2: vmovupd 528(ee=%ymm2 +vmovupd 528(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 352(pp=%ymm1 +vmovupd 352(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 560 ] +# asm 1: vmovupd 560(ee=reg256#3 +# asm 2: vmovupd 560(ee=%ymm2 +vmovupd 560(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 384(pp=%ymm1 +vmovupd 384(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 592 ] +# asm 1: vmovupd 592(ee=reg256#3 +# asm 2: vmovupd 592(ee=%ymm2 +vmovupd 592(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 416(pp=%ymm1 +vmovupd 416(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 624 ] +# asm 1: vmovupd 624(ee=reg256#3 +# asm 2: vmovupd 624(ee=%ymm2 +vmovupd 624(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 448(pp=%ymm1 +vmovupd 448(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 656 ] +# asm 1: vmovupd 656(ee=reg256#3 +# asm 2: vmovupd 656(ee=%ymm2 +vmovupd 656(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 480(pp=%ymm1 +vmovupd 480(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 688 ] +# asm 1: vmovupd 688(ee=reg256#3 +# asm 2: vmovupd 688(ee=%ymm2 +vmovupd 688(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 512(pp=%ymm1 +vmovupd 512(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 720 ] +# asm 1: vmovupd 720(ee=reg256#3 +# asm 2: vmovupd 720(ee=%ymm2 +vmovupd 
720(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 544(pp=%ymm1 +vmovupd 544(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 752 ] +# asm 1: vmovupd 752(ee=reg256#3 +# asm 2: vmovupd 752(ee=%ymm2 +vmovupd 752(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 576(pp=%ymm1 +vmovupd 576(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 784 ] +# asm 1: vmovupd 784(ee=reg256#3 +# asm 2: vmovupd 784(ee=%ymm2 +vmovupd 784(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 608(pp=%ymm1 +vmovupd 608(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 816 ] +# asm 1: vmovupd 816(ee=reg256#3 +# asm 2: vmovupd 816(ee=%ymm2 +vmovupd 816(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 640(pp=%ymm1 +vmovupd 640(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 848 ] +# asm 1: vmovupd 848(ee=reg256#3 +# asm 2: vmovupd 848(ee=%ymm2 +vmovupd 848(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 672(pp=%ymm1 +vmovupd 672(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 880 ] +# asm 1: vmovupd 880(ee=reg256#3 +# asm 2: vmovupd 880(ee=%ymm2 +vmovupd 880(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 704(pp=%ymm1 +vmovupd 704(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 912 ] +# asm 1: vmovupd 912(ee=reg256#3 +# asm 2: vmovupd 912(ee=%ymm2 +vmovupd 912(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 736(pp=%ymm1 +vmovupd 736(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 944 ] +# asm 1: vmovupd 944(ee=reg256#3 +# asm 2: vmovupd 944(ee=%ymm2 +vmovupd 944(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 768(pp=%ymm1 +vmovupd 768(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 976 ] +# asm 1: vmovupd 976(ee=reg256#3 +# asm 2: vmovupd 976(ee=%ymm2 +vmovupd 976(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand buf=stack256#1 +# asm 2: vmovapd buf=0(%rsp) +vmovapd %ymm0,0(%rsp) + +# qhasm: s = mem64[input_1 + 800] +# asm 1: movq 800(s=int64#6 +# asm 2: movq 800(s=%r9 +movq 800(%rsi),%r9 + +# qhasm: e = mem64[input_2 + 1008] +# asm 1: movq 1008(e=int64#7 +# asm 2: movq 1008(e=%rax +movq 1008(%rdx),%rax + +# qhasm: s &= e +# asm 1: and p=int64#7 +# asm 2: movq 808(p=%rax +movq 808(%rsi),%rax + +# qhasm: e = mem64[input_2 + 1016] +# asm 1: movq 1016(e=int64#8 +# asm 2: movq 1016(e=%r10 +movq 1016(%rdx),%r10 + +# qhasm: p &= e +# asm 1: and c_all=int64#6 +# asm 2: popcnt c_all=%r9 +popcnt %r9, %r9 + +# qhasm: b64 = mem64[ buf_ptr + 0 ] +# asm 1: movq 0(b64=int64#7 +# asm 2: movq 0(b64=%rax +movq 0(%rcx),%rax + +# qhasm: c = count(b64) +# asm 1: popcnt c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# qhasm: c_all ^= c +# asm 1: xor b64=int64#7 +# asm 2: movq 8(b64=%rax +movq 8(%rcx),%rax + +# qhasm: c = count(b64) +# asm 1: popcnt c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# qhasm: c_all ^= c +# asm 1: xor b64=int64#7 +# asm 2: movq 16(b64=%rax +movq 16(%rcx),%rax + +# qhasm: c = count(b64) +# asm 1: popcnt c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# qhasm: c_all ^= c +# asm 1: xor b64=int64#7 +# asm 2: movq 24(b64=%rax +movq 24(%rcx),%rax + +# qhasm: c = count(b64) +# asm 1: popcnt c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# qhasm: c_all ^= c +# asm 1: xor addr=int64#7 +# asm 2: mov addr=%rax +mov %r8,%rax + +# qhasm: (uint64) addr >>= 3 +# asm 1: shr $3,synd=int64#8 +# asm 2: movzbq 0(synd=%r10 +movzbq 0(%rax),%r10 + +# qhasm: synd <<= 1 +# asm 1: 
shl $1,ss=reg256#1 +# asm 2: vmovupd 0(ss=%ymm0 +vmovupd 0(%rdi),%ymm0 + +# qhasm: ee = mem256[ input_2 + 0 ] +# asm 1: vmovupd 0(ee=reg256#2 +# asm 2: vmovupd 0(ee=%ymm1 +vmovupd 0(%rdx),%ymm1 + +# qhasm: ss ^= ee +# asm 1: vpxor ss=reg256#1 +# asm 2: vmovupd 32(ss=%ymm0 +vmovupd 32(%rdi),%ymm0 + +# qhasm: ee = mem256[ input_2 + 32 ] +# asm 1: vmovupd 32(ee=reg256#2 +# asm 2: vmovupd 32(ee=%ymm1 +vmovupd 32(%rdx),%ymm1 + +# qhasm: ss ^= ee +# asm 1: vpxor ss=reg256#1 +# asm 2: vmovupd 64(ss=%ymm0 +vmovupd 64(%rdi),%ymm0 + +# qhasm: ee = mem256[ input_2 + 64 ] +# asm 1: vmovupd 64(ee=reg256#2 +# asm 2: vmovupd 64(ee=%ymm1 +vmovupd 64(%rdx),%ymm1 + +# qhasm: ss ^= ee +# asm 1: vpxor ss=reg256#1 +# asm 2: vmovupd 96(ss=%ymm0 +vmovupd 96(%rdi),%ymm0 + +# qhasm: ee = mem256[ input_2 + 96 ] +# asm 1: vmovupd 96(ee=reg256#2 +# asm 2: vmovupd 96(ee=%ymm1 +vmovupd 96(%rdx),%ymm1 + +# qhasm: ss ^= ee +# asm 1: vpxor ss=reg256#1 +# asm 2: vmovupd 128(ss=%ymm0 +vmovupd 128(%rdi),%ymm0 + +# qhasm: ee = mem256[ input_2 + 128 ] +# asm 1: vmovupd 128(ee=reg256#2 +# asm 2: vmovupd 128(ee=%ymm1 +vmovupd 128(%rdx),%ymm1 + +# qhasm: ss ^= ee +# asm 1: vpxor ss=reg256#1 +# asm 2: vmovupd 160(ss=%ymm0 +vmovupd 160(%rdi),%ymm0 + +# qhasm: ee = mem256[ input_2 + 160 ] +# asm 1: vmovupd 160(ee=reg256#2 +# asm 2: vmovupd 160(ee=%ymm1 +vmovupd 160(%rdx),%ymm1 + +# qhasm: ss ^= ee +# asm 1: vpxor s=int64#2 +# asm 2: movq 192(s=%rsi +movq 192(%rdi),%rsi + +# qhasm: e = mem64[ input_2 + 192 ] +# asm 1: movq 192(e=int64#4 +# asm 2: movq 192(e=%rcx +movq 192(%rdx),%rcx + +# qhasm: s ^= e +# asm 1: xor s=int64#2 +# asm 2: movq 200(s=%rsi +movq 200(%rdi),%rsi + +# qhasm: e = mem64[ input_2 + 200 ] +# asm 1: movq 200(e=int64#3 +# asm 2: movq 200(e=%rdx +movq 200(%rdx),%rdx + +# qhasm: s ^= e +# asm 1: xor mask0=reg128#1 +# asm 2: movdqa PQCLEAN_MCELIECE8192128_AVX_MASK5_0(%rip),>mask0=%xmm0 +movdqa PQCLEAN_MCELIECE8192128_AVX_MASK5_0(%rip),%xmm0 + +# qhasm: mask1 aligned= mem128[ PQCLEAN_MCELIECE8192128_AVX_MASK5_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE8192128_AVX_MASK5_1(%rip),>mask1=reg128#2 +# asm 2: movdqa PQCLEAN_MCELIECE8192128_AVX_MASK5_1(%rip),>mask1=%xmm1 +movdqa PQCLEAN_MCELIECE8192128_AVX_MASK5_1(%rip),%xmm1 + +# qhasm: mask2 aligned= mem128[ PQCLEAN_MCELIECE8192128_AVX_MASK4_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE8192128_AVX_MASK4_0(%rip),>mask2=reg128#3 +# asm 2: movdqa PQCLEAN_MCELIECE8192128_AVX_MASK4_0(%rip),>mask2=%xmm2 +movdqa PQCLEAN_MCELIECE8192128_AVX_MASK4_0(%rip),%xmm2 + +# qhasm: mask3 aligned= mem128[ PQCLEAN_MCELIECE8192128_AVX_MASK4_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE8192128_AVX_MASK4_1(%rip),>mask3=reg128#4 +# asm 2: movdqa PQCLEAN_MCELIECE8192128_AVX_MASK4_1(%rip),>mask3=%xmm3 +movdqa PQCLEAN_MCELIECE8192128_AVX_MASK4_1(%rip),%xmm3 + +# qhasm: mask4 aligned= mem128[ PQCLEAN_MCELIECE8192128_AVX_MASK3_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE8192128_AVX_MASK3_0(%rip),>mask4=reg128#5 +# asm 2: movdqa PQCLEAN_MCELIECE8192128_AVX_MASK3_0(%rip),>mask4=%xmm4 +movdqa PQCLEAN_MCELIECE8192128_AVX_MASK3_0(%rip),%xmm4 + +# qhasm: mask5 aligned= mem128[ PQCLEAN_MCELIECE8192128_AVX_MASK3_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE8192128_AVX_MASK3_1(%rip),>mask5=reg128#6 +# asm 2: movdqa PQCLEAN_MCELIECE8192128_AVX_MASK3_1(%rip),>mask5=%xmm5 +movdqa PQCLEAN_MCELIECE8192128_AVX_MASK3_1(%rip),%xmm5 + +# qhasm: x0 = mem128[ input_0 + 0 ] +# asm 1: movdqu 0(x0=reg128#7 +# asm 2: movdqu 0(x0=%xmm6 +movdqu 0(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 128 ] +# asm 1: movdqu 128(x1=reg128#8 +# asm 2: movdqu 128(x1=%xmm7 
+movdqu 128(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 256 ] +# asm 1: movdqu 256(x2=reg128#9 +# asm 2: movdqu 256(x2=%xmm8 +movdqu 256(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 384 ] +# asm 1: movdqu 384(x3=reg128#10 +# asm 2: movdqu 384(x3=%xmm9 +movdqu 384(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 512 ] +# asm 1: movdqu 512(x4=reg128#11 +# asm 2: movdqu 512(x4=%xmm10 +movdqu 512(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 640 ] +# asm 1: movdqu 640(x5=reg128#12 +# asm 2: movdqu 640(x5=%xmm11 +movdqu 640(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 768 ] +# asm 1: movdqu 768(x6=reg128#13 +# asm 2: movdqu 768(x6=%xmm12 +movdqu 768(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 896 ] +# asm 1: movdqu 896(x7=reg128#14 +# asm 2: movdqu 896(x7=%xmm13 +movdqu 896(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# 
qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 
<< 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 0 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 16(x0=%xmm6 +movdqu 16(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 144 ] +# asm 1: movdqu 144(x1=reg128#8 +# asm 2: movdqu 144(x1=%xmm7 +movdqu 144(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 272 ] +# asm 1: movdqu 272(x2=reg128#9 +# asm 2: movdqu 272(x2=%xmm8 +movdqu 272(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 400 ] +# asm 1: movdqu 400(x3=reg128#10 +# asm 2: movdqu 400(x3=%xmm9 +movdqu 400(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 528 ] +# asm 1: movdqu 528(x4=reg128#11 +# asm 2: movdqu 528(x4=%xmm10 +movdqu 528(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 656 ] +# asm 1: movdqu 656(x5=reg128#12 +# asm 2: movdqu 656(x5=%xmm11 +movdqu 656(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 784 ] +# asm 1: movdqu 784(x6=reg128#13 +# asm 2: movdqu 784(x6=%xmm12 +movdqu 784(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 912 ] +# asm 1: movdqu 912(x7=reg128#14 +# asm 2: movdqu 912(x7=%xmm13 +movdqu 912(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor 
x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand 
v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw 
$8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 16 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 32(x0=%xmm6 +movdqu 32(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 160 ] +# asm 1: movdqu 160(x1=reg128#8 +# asm 2: movdqu 160(x1=%xmm7 +movdqu 160(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 288 ] +# asm 1: movdqu 288(x2=reg128#9 +# asm 2: movdqu 288(x2=%xmm8 +movdqu 288(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 416 ] +# asm 1: movdqu 416(x3=reg128#10 +# asm 2: movdqu 416(x3=%xmm9 +movdqu 416(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 544 ] +# asm 1: movdqu 544(x4=reg128#11 +# asm 2: movdqu 544(x4=%xmm10 +movdqu 544(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 672 ] +# asm 1: movdqu 672(x5=reg128#12 +# asm 2: movdqu 672(x5=%xmm11 +movdqu 672(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 800 ] +# asm 1: movdqu 800(x6=reg128#13 +# asm 2: movdqu 800(x6=%xmm12 +movdqu 800(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 928 ] +# asm 1: movdqu 928(x7=reg128#14 +# asm 2: movdqu 928(x7=%xmm13 +movdqu 928(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor 
%xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand 
%xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 32 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 48(x0=%xmm6 +movdqu 48(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 176 ] +# asm 1: movdqu 176(x1=reg128#8 +# asm 2: movdqu 176(x1=%xmm7 +movdqu 176(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 304 ] +# asm 1: movdqu 304(x2=reg128#9 +# asm 2: movdqu 304(x2=%xmm8 +movdqu 304(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 432 ] +# asm 1: movdqu 432(x3=reg128#10 +# asm 2: movdqu 432(x3=%xmm9 +movdqu 432(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 560 ] +# asm 1: movdqu 560(x4=reg128#11 +# asm 2: movdqu 560(x4=%xmm10 +movdqu 560(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 688 ] +# asm 1: movdqu 688(x5=reg128#12 +# asm 2: movdqu 688(x5=%xmm11 +movdqu 688(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 816 ] +# asm 1: movdqu 816(x6=reg128#13 +# asm 2: movdqu 816(x6=%xmm12 +movdqu 816(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 
+ 944 ] +# asm 1: movdqu 944(x7=reg128#14 +# asm 2: movdqu 944(x7=%xmm13 +movdqu 944(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 
& mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw 
$8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 48 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 64(x0=%xmm6 +movdqu 64(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 192 ] +# asm 1: movdqu 192(x1=reg128#8 +# asm 2: movdqu 192(x1=%xmm7 +movdqu 192(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 320 ] +# asm 1: movdqu 320(x2=reg128#9 +# asm 2: movdqu 320(x2=%xmm8 +movdqu 320(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 448 ] +# asm 1: movdqu 448(x3=reg128#10 +# asm 2: movdqu 448(x3=%xmm9 +movdqu 448(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 576 ] +# asm 1: movdqu 576(x4=reg128#11 +# asm 2: movdqu 576(x4=%xmm10 +movdqu 576(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 704 ] +# asm 1: movdqu 704(x5=reg128#12 +# asm 2: movdqu 704(x5=%xmm11 +movdqu 704(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 832 ] +# asm 1: movdqu 832(x6=reg128#13 +# asm 2: movdqu 832(x6=%xmm12 +movdqu 832(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 960 ] +# asm 1: movdqu 960(x7=reg128#14 +# asm 2: movdqu 960(x7=%xmm13 +movdqu 960(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor 
x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand 
v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 64 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 80(x0=%xmm6 +movdqu 80(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 208 ] +# asm 1: movdqu 208(x1=reg128#8 +# asm 2: movdqu 208(x1=%xmm7 +movdqu 
208(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 336 ] +# asm 1: movdqu 336(x2=reg128#9 +# asm 2: movdqu 336(x2=%xmm8 +movdqu 336(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 464 ] +# asm 1: movdqu 464(x3=reg128#10 +# asm 2: movdqu 464(x3=%xmm9 +movdqu 464(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 592 ] +# asm 1: movdqu 592(x4=reg128#11 +# asm 2: movdqu 592(x4=%xmm10 +movdqu 592(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 720 ] +# asm 1: movdqu 720(x5=reg128#12 +# asm 2: movdqu 720(x5=%xmm11 +movdqu 720(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 848 ] +# asm 1: movdqu 848(x6=reg128#13 +# asm 2: movdqu 848(x6=%xmm12 +movdqu 848(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 976 ] +# asm 1: movdqu 976(x7=reg128#14 +# asm 2: movdqu 976(x7=%xmm13 +movdqu 976(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 
= x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 
1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 80 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 96(x0=%xmm6 +movdqu 96(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 224 ] +# asm 1: movdqu 224(x1=reg128#8 +# asm 2: movdqu 224(x1=%xmm7 +movdqu 224(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 352 ] +# asm 1: movdqu 352(x2=reg128#9 +# asm 2: movdqu 352(x2=%xmm8 +movdqu 352(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 480 ] +# asm 1: movdqu 480(x3=reg128#10 +# asm 2: movdqu 480(x3=%xmm9 +movdqu 480(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 608 ] +# asm 1: movdqu 608(x4=reg128#11 +# asm 2: movdqu 608(x4=%xmm10 +movdqu 608(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 736 ] +# asm 1: movdqu 736(x5=reg128#12 +# asm 2: movdqu 736(x5=%xmm11 +movdqu 736(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 864 ] +# asm 1: movdqu 864(x6=reg128#13 +# asm 2: movdqu 864(x6=%xmm12 +movdqu 864(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 992 ] +# asm 1: movdqu 992(x7=reg128#14 +# asm 2: movdqu 992(x7=%xmm13 +movdqu 992(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# 
asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# 
[The rest of this qhasm-generated SSE transpose routine from the vendored PQClean Classic McEliece (mceliece8192128, AVX) implementation follows here: the remaining shift-by-16 rounds (vpslld/vpsrld $16 with mask2/mask3) and the shift-by-8 rounds (vpsllw/vpsrlw $8 with mask4/mask5) are applied to the row pairs of x0..x7, the eight 128-bit rows are written back in place with movdqu, and the same three rounds (shift 32 with vpsllq/vpsrlq and mask0/mask1, then 16, then 8) are repeated for the eight rows loaded from input_0 + 112, 240, 368, 496, 624, 752, 880 and 1008. Every step keeps the generator's comment convention, for example:]

+# qhasm: x4 = v00 | v10
+# asm 1: vpor x4=reg128#13
+# asm 2: vpor x4=%xmm12
+vpor %xmm15,%xmm12,%xmm12
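The long instruction sequences in this hunk all implement one well-known technique: a constant-time bit-matrix transpose built from masked "delta swaps". Two rows that sit 2^k rows apart are combined with AND, shift and OR so that their masked bit groups are exchanged, and repeating this for every shift distance transposes the matrix without any secret-dependent branches or memory accesses. The scalar C sketch below is our own illustration, modelled on the 64x64 reference transpose used by Classic McEliece implementations; the function and variable names are ours, and the vendored AVX code instead works on 128-bit rows with the rounds split across two routines.

#include <stdint.h>

/*
 * Illustrative scalar version of the masked-swap ("delta swap") transpose
 * that the qhasm-generated rounds in this hunk perform on 128-bit rows.
 * Transposes a 64x64 bit matrix held as 64 uint64_t rows.
 */
static void transpose_64x64_sketch(uint64_t *out, const uint64_t *in) {
    static const uint64_t masks[6][2] = {
        {0x5555555555555555ULL, 0xAAAAAAAAAAAAAAAAULL}, /* shift 1  */
        {0x3333333333333333ULL, 0xCCCCCCCCCCCCCCCCULL}, /* shift 2  */
        {0x0F0F0F0F0F0F0F0FULL, 0xF0F0F0F0F0F0F0F0ULL}, /* shift 4  */
        {0x00FF00FF00FF00FFULL, 0xFF00FF00FF00FF00ULL}, /* shift 8  */
        {0x0000FFFF0000FFFFULL, 0xFFFF0000FFFF0000ULL}, /* shift 16 */
        {0x00000000FFFFFFFFULL, 0xFFFFFFFF00000000ULL}, /* shift 32 */
    };
    int i, j, d, s;
    uint64_t x, y;

    for (i = 0; i < 64; i++) {
        out[i] = in[i];
    }
    /* One round per shift distance 32, 16, 8, 4, 2, 1: pair up rows that
     * are s apart and swap the masked bit groups between them -- the same
     * vpand / shift / vpor pattern as the assembly, one 64-bit word at a
     * time instead of one 128-bit register. */
    for (d = 5; d >= 0; d--) {
        s = 1 << d;
        for (i = 0; i < 64; i += 2 * s) {
            for (j = i; j < i + s; j++) {
                x = (out[j] & masks[d][0]) | ((out[j + s] & masks[d][0]) << s);
                y = ((out[j] & masks[d][1]) >> s) | (out[j + s] & masks[d][1]);
                out[j] = x;
                out[j + s] = y;
            }
        }
    }
}

The routine above corresponds to the shift-32, shift-16 and shift-8 rounds of this loop, unrolled over 128-bit registers; the routine that follows handles the remaining shifts of 4, 2 and 1.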
[The hunk's second qhasm-generated routine follows: it loads mask0..mask5 from the 128-bit constants PQCLEAN_MCELIECE8192128_AVX_MASK2_0/_1, PQCLEAN_MCELIECE8192128_AVX_MASK1_0/_1 and PQCLEAN_MCELIECE8192128_AVX_MASK0_0/_1, then walks the matrix in eight-row blocks at input_0 offsets 0-112, 128-240, 256-368, 384-496, 512-624, 640-752 and 768-880 (16 bytes per row). Each block gets three masked-swap rounds, shifting by 4 (psllq/psrlq $4 with mask0/mask1), by 2 (mask2/mask3) and by 1 (mask4/mask5), and is then stored back in place with movdqu; the rounds for the block starting at offset 768 continue below. A representative step, as emitted by the generator:]

+# qhasm: v00 = x0 & mask0
+# asm 1: vpand v00=reg128#15
+# asm 2: vpand v00=%xmm14
+vpand %xmm0,%xmm6,%xmm14
vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor 
x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 768 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 896(x0=%xmm6 +movdqu 896(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 912 ] +# 
asm 1: movdqu 912(x1=reg128#8 +# asm 2: movdqu 912(x1=%xmm7 +movdqu 912(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 928 ] +# asm 1: movdqu 928(x2=reg128#9 +# asm 2: movdqu 928(x2=%xmm8 +movdqu 928(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 944 ] +# asm 1: movdqu 944(x3=reg128#10 +# asm 2: movdqu 944(x3=%xmm9 +movdqu 944(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 960 ] +# asm 1: movdqu 960(x4=reg128#11 +# asm 2: movdqu 960(x4=%xmm10 +movdqu 960(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 976 ] +# asm 1: movdqu 976(x5=reg128#12 +# asm 2: movdqu 976(x5=%xmm11 +movdqu 976(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 992 ] +# asm 1: movdqu 992(x6=reg128#13 +# asm 2: movdqu 992(x6=%xmm12 +movdqu 992(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 1008 ] +# asm 1: movdqu 1008(x7=reg128#14 +# asm 2: movdqu 1008(x7=%xmm13 +movdqu 1008(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#1 +# asm 2: vpand v10=%xmm0 +vpand %xmm0,%xmm13,%xmm0 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm1,%xmm13,%xmm1 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm12,%xmm0 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm9,%xmm1 + +# qhasm: v00 = x0 & mask2 +# 
asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm2,%xmm14,%xmm9 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#13 +# asm 2: vpand v10=%xmm12 +vpand %xmm2,%xmm11,%xmm12 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm3,%xmm14,%xmm13 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm12,%xmm9,%xmm9 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm13,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm10,%xmm12 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#14 +# asm 2: vpand v10=%xmm13 +vpand %xmm2,%xmm0,%xmm13 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm3,%xmm0,%xmm0 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#13 +# asm 2: vpor x1=%xmm12 +vpor %xmm13,%xmm12,%xmm12 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm10,%xmm0 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm2,%xmm6,%xmm10 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#14 +# asm 2: vpand v10=%xmm13 +vpand %xmm2,%xmm8,%xmm13 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#11 +# asm 2: vpor x4=%xmm10 +vpor %xmm13,%xmm10,%xmm10 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#3 +# asm 2: vpand v10=%xmm2 +vpand %xmm2,%xmm1,%xmm2 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm3,%xmm1,%xmm1 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#3 +# asm 2: vpor x5=%xmm2 +vpor %xmm2,%xmm8,%xmm2 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm7,%xmm1 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#4 +# asm 2: vpand v00=%xmm3 +vpand %xmm4,%xmm9,%xmm3 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#8 +# asm 2: vpand v10=%xmm7 +vpand %xmm4,%xmm12,%xmm7 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm5,%xmm9,%xmm8 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm5,%xmm12,%xmm9 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#4 +# asm 2: vpor x0=%xmm3 +vpor %xmm7,%xmm3,%xmm3 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#8 +# asm 2: vpor x1=%xmm7 +vpor %xmm9,%xmm8,%xmm7 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm4,%xmm11,%xmm8 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#10 +# asm 2: vpand v10=%xmm9 +vpand %xmm4,%xmm0,%xmm9 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: 
vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm5,%xmm0,%xmm0 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#9 +# asm 2: vpor x2=%xmm8 +vpor %xmm9,%xmm8,%xmm8 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm11,%xmm0 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm10,%xmm9 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#12 +# asm 2: vpand v10=%xmm11 +vpand %xmm4,%xmm2,%xmm11 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#3 +# asm 2: vpand v11=%xmm2 +vpand %xmm5,%xmm2,%xmm2 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#10 +# asm 2: vpor x4=%xmm9 +vpor %xmm11,%xmm9,%xmm9 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#3 +# asm 2: vpor x5=%xmm2 +vpor %xmm2,%xmm10,%xmm2 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm4,%xmm6,%xmm10 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#5 +# asm 2: vpand v10=%xmm4 +vpand %xmm4,%xmm1,%xmm4 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm5,%xmm1,%xmm1 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#5 +# asm 2: vpor x6=%xmm4 +vpor %xmm4,%xmm10,%xmm4 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm6,%xmm1 + +# qhasm: mem128[ input_0 + 896 ] = x0 +# asm 1: movdqu mask0=reg256#1 +# asm 2: vmovapd PQCLEAN_MCELIECE8192128_AVX_MASK5_0,>mask0=%ymm0 +vmovapd PQCLEAN_MCELIECE8192128_AVX_MASK5_0(%rip),%ymm0 + +# qhasm: mask1 aligned= mem256[ PQCLEAN_MCELIECE8192128_AVX_MASK5_1 ] +# asm 1: vmovapd PQCLEAN_MCELIECE8192128_AVX_MASK5_1,>mask1=reg256#2 +# asm 2: vmovapd PQCLEAN_MCELIECE8192128_AVX_MASK5_1,>mask1=%ymm1 +vmovapd PQCLEAN_MCELIECE8192128_AVX_MASK5_1(%rip),%ymm1 + +# qhasm: mask2 aligned= mem256[ PQCLEAN_MCELIECE8192128_AVX_MASK4_0 ] +# asm 1: vmovapd PQCLEAN_MCELIECE8192128_AVX_MASK4_0,>mask2=reg256#3 +# asm 2: vmovapd PQCLEAN_MCELIECE8192128_AVX_MASK4_0,>mask2=%ymm2 +vmovapd PQCLEAN_MCELIECE8192128_AVX_MASK4_0(%rip),%ymm2 + +# qhasm: mask3 aligned= mem256[ PQCLEAN_MCELIECE8192128_AVX_MASK4_1 ] +# asm 1: vmovapd PQCLEAN_MCELIECE8192128_AVX_MASK4_1,>mask3=reg256#4 +# asm 2: vmovapd PQCLEAN_MCELIECE8192128_AVX_MASK4_1,>mask3=%ymm3 +vmovapd PQCLEAN_MCELIECE8192128_AVX_MASK4_1(%rip),%ymm3 + +# qhasm: mask4 aligned= mem256[ PQCLEAN_MCELIECE8192128_AVX_MASK3_0 ] +# asm 1: vmovapd PQCLEAN_MCELIECE8192128_AVX_MASK3_0,>mask4=reg256#5 +# asm 2: vmovapd PQCLEAN_MCELIECE8192128_AVX_MASK3_0,>mask4=%ymm4 +vmovapd PQCLEAN_MCELIECE8192128_AVX_MASK3_0(%rip),%ymm4 + +# qhasm: mask5 aligned= mem256[ PQCLEAN_MCELIECE8192128_AVX_MASK3_1 ] +# asm 1: vmovapd PQCLEAN_MCELIECE8192128_AVX_MASK3_1,>mask5=reg256#6 +# asm 2: vmovapd PQCLEAN_MCELIECE8192128_AVX_MASK3_1,>mask5=%ymm5 +vmovapd PQCLEAN_MCELIECE8192128_AVX_MASK3_1(%rip),%ymm5 + +# qhasm: x0 = mem256[ input_0 + 0 ] +# asm 1: vmovupd 0(x0=reg256#7 +# asm 2: vmovupd 0(x0=%ymm6 +vmovupd 0(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 256 ] +# asm 1: vmovupd 256(x1=reg256#8 +# asm 2: vmovupd 256(x1=%ymm7 +vmovupd 256(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 512 ] +# asm 1: vmovupd 512(x2=reg256#9 +# asm 2: 
vmovupd 512(x2=%ymm8 +vmovupd 512(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 768 ] +# asm 1: vmovupd 768(x3=reg256#10 +# asm 2: vmovupd 768(x3=%ymm9 +vmovupd 768(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1024 ] +# asm 1: vmovupd 1024(x4=reg256#11 +# asm 2: vmovupd 1024(x4=%ymm10 +vmovupd 1024(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1280 ] +# asm 1: vmovupd 1280(x5=reg256#12 +# asm 2: vmovupd 1280(x5=%ymm11 +vmovupd 1280(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1536 ] +# asm 1: vmovupd 1536(x6=reg256#13 +# asm 2: vmovupd 1536(x6=%ymm12 +vmovupd 1536(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1792 ] +# asm 1: vmovupd 1792(x7=reg256#14 +# asm 2: vmovupd 1792(x7=%ymm13 +vmovupd 1792(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand 
v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw 
$8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 0 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 32(x0=%ymm6 +vmovupd 32(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 288 ] +# asm 1: vmovupd 288(x1=reg256#8 +# asm 2: vmovupd 288(x1=%ymm7 +vmovupd 288(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 544 ] +# asm 1: vmovupd 544(x2=reg256#9 +# asm 2: vmovupd 544(x2=%ymm8 +vmovupd 544(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 800 ] +# asm 1: vmovupd 800(x3=reg256#10 +# asm 2: vmovupd 800(x3=%ymm9 +vmovupd 800(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1056 ] +# asm 1: vmovupd 1056(x4=reg256#11 +# asm 2: vmovupd 1056(x4=%ymm10 +vmovupd 1056(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1312 ] +# asm 1: vmovupd 1312(x5=reg256#12 +# asm 2: vmovupd 1312(x5=%ymm11 +vmovupd 1312(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1568 ] +# asm 1: vmovupd 1568(x6=reg256#13 +# asm 2: vmovupd 1568(x6=%ymm12 +vmovupd 1568(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1824 ] +# asm 1: vmovupd 1824(x7=reg256#14 +# asm 2: vmovupd 1824(x7=%ymm13 +vmovupd 1824(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# 
asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# 
asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw 
$8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 32 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 64(x0=%ymm6 +vmovupd 64(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 320 ] +# asm 1: vmovupd 320(x1=reg256#8 +# asm 2: vmovupd 320(x1=%ymm7 +vmovupd 320(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 576 ] +# asm 1: vmovupd 576(x2=reg256#9 +# asm 2: vmovupd 576(x2=%ymm8 +vmovupd 576(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 832 ] +# asm 1: vmovupd 832(x3=reg256#10 +# asm 2: vmovupd 832(x3=%ymm9 +vmovupd 832(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1088 ] +# asm 1: vmovupd 1088(x4=reg256#11 +# asm 2: vmovupd 1088(x4=%ymm10 +vmovupd 1088(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1344 ] +# asm 1: vmovupd 1344(x5=reg256#12 +# asm 2: vmovupd 1344(x5=%ymm11 +vmovupd 1344(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1600 ] +# asm 1: vmovupd 1600(x6=reg256#13 +# asm 2: vmovupd 1600(x6=%ymm12 +vmovupd 1600(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1856 ] +# asm 1: vmovupd 1856(x7=reg256#14 +# asm 2: vmovupd 1856(x7=%ymm13 +vmovupd 1856(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor 
x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand 
%ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 64 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 96(x0=%ymm6 +vmovupd 96(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 352 ] +# asm 1: vmovupd 352(x1=reg256#8 +# asm 2: vmovupd 352(x1=%ymm7 +vmovupd 352(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 608 ] +# asm 1: vmovupd 608(x2=reg256#9 +# asm 2: vmovupd 608(x2=%ymm8 +vmovupd 608(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 864 ] +# asm 1: vmovupd 864(x3=reg256#10 +# asm 2: vmovupd 864(x3=%ymm9 +vmovupd 864(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1120 ] +# asm 1: vmovupd 1120(x4=reg256#11 +# asm 2: vmovupd 1120(x4=%ymm10 +vmovupd 1120(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1376 ] +# asm 1: vmovupd 1376(x5=reg256#12 +# asm 2: vmovupd 1376(x5=%ymm11 +vmovupd 1376(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1632 ] +# asm 1: vmovupd 1632(x6=reg256#13 +# asm 2: vmovupd 1632(x6=%ymm12 +vmovupd 
1632(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1888 ] +# asm 1: vmovupd 1888(x7=reg256#14 +# asm 2: vmovupd 1888(x7=%ymm13 +vmovupd 1888(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor 
x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand 
%ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 96 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 128(x0=%ymm6 +vmovupd 128(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 384 ] +# asm 1: vmovupd 384(x1=reg256#8 +# asm 2: vmovupd 384(x1=%ymm7 +vmovupd 384(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 640 ] +# asm 1: vmovupd 640(x2=reg256#9 +# asm 2: vmovupd 640(x2=%ymm8 +vmovupd 640(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 896 ] +# asm 1: vmovupd 896(x3=reg256#10 +# asm 2: vmovupd 896(x3=%ymm9 +vmovupd 896(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1152 ] +# asm 1: vmovupd 1152(x4=reg256#11 +# asm 2: vmovupd 1152(x4=%ymm10 +vmovupd 1152(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1408 ] +# asm 1: vmovupd 1408(x5=reg256#12 +# asm 2: vmovupd 1408(x5=%ymm11 +vmovupd 1408(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1664 ] +# asm 1: vmovupd 1664(x6=reg256#13 +# asm 2: vmovupd 1664(x6=%ymm12 +vmovupd 1664(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1920 ] +# asm 1: vmovupd 1920(x7=reg256#14 +# asm 2: vmovupd 1920(x7=%ymm13 +vmovupd 1920(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 
2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor 
x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 128 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 160(x0=%ymm6 +vmovupd 
160(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 416 ] +# asm 1: vmovupd 416(x1=reg256#8 +# asm 2: vmovupd 416(x1=%ymm7 +vmovupd 416(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 672 ] +# asm 1: vmovupd 672(x2=reg256#9 +# asm 2: vmovupd 672(x2=%ymm8 +vmovupd 672(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 928 ] +# asm 1: vmovupd 928(x3=reg256#10 +# asm 2: vmovupd 928(x3=%ymm9 +vmovupd 928(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1184 ] +# asm 1: vmovupd 1184(x4=reg256#11 +# asm 2: vmovupd 1184(x4=%ymm10 +vmovupd 1184(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1440 ] +# asm 1: vmovupd 1440(x5=reg256#12 +# asm 2: vmovupd 1440(x5=%ymm11 +vmovupd 1440(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1696 ] +# asm 1: vmovupd 1696(x6=reg256#13 +# asm 2: vmovupd 1696(x6=%ymm12 +vmovupd 1696(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1952 ] +# asm 1: vmovupd 1952(x7=reg256#14 +# asm 2: vmovupd 1952(x7=%ymm13 +vmovupd 1952(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 
2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor 
%ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 160 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 192(x0=%ymm6 +vmovupd 192(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 448 ] +# asm 1: vmovupd 448(x1=reg256#8 +# asm 2: vmovupd 448(x1=%ymm7 +vmovupd 448(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 704 ] +# asm 1: vmovupd 704(x2=reg256#9 +# asm 2: vmovupd 704(x2=%ymm8 +vmovupd 704(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 960 ] +# asm 1: vmovupd 960(x3=reg256#10 +# asm 2: vmovupd 960(x3=%ymm9 +vmovupd 960(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1216 ] +# asm 1: vmovupd 1216(x4=reg256#11 +# asm 2: vmovupd 1216(x4=%ymm10 +vmovupd 1216(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1472 ] +# asm 1: vmovupd 1472(x5=reg256#12 +# asm 2: vmovupd 1472(x5=%ymm11 +vmovupd 1472(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1728 ] +# asm 1: vmovupd 1728(x6=reg256#13 +# asm 2: vmovupd 1728(x6=%ymm12 +vmovupd 1728(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1984 ] +# asm 1: vmovupd 1984(x7=reg256#14 +# asm 2: vmovupd 1984(x7=%ymm13 +vmovupd 1984(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 
+# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# 
asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor 
%ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 192 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 224(x0=%ymm6 +vmovupd 224(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 480 ] +# asm 1: vmovupd 480(x1=reg256#8 +# asm 2: vmovupd 480(x1=%ymm7 +vmovupd 480(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 736 ] +# asm 1: vmovupd 736(x2=reg256#9 +# asm 2: vmovupd 736(x2=%ymm8 +vmovupd 736(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 992 ] +# asm 1: vmovupd 992(x3=reg256#10 +# asm 2: vmovupd 992(x3=%ymm9 +vmovupd 992(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1248 ] +# asm 1: vmovupd 1248(x4=reg256#11 +# asm 2: vmovupd 1248(x4=%ymm10 +vmovupd 1248(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1504 ] +# asm 1: vmovupd 1504(x5=reg256#12 +# asm 2: vmovupd 1504(x5=%ymm11 +vmovupd 1504(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1760 ] +# asm 1: vmovupd 1760(x6=reg256#13 +# asm 2: vmovupd 1760(x6=%ymm12 +vmovupd 1760(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 2016 ] +# asm 1: vmovupd 2016(x7=reg256#14 +# asm 2: vmovupd 2016(x7=%ymm13 +vmovupd 2016(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand 
v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#1 +# asm 2: vpand v00=%ymm0 +vpand %ymm9,%ymm0,%ymm0 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#13 +# asm 2: vpsllq $32,v10=%ymm12 +vpsllq $32,%ymm13,%ymm12 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#2 +# asm 2: vpand v11=%ymm1 +vpand %ymm13,%ymm1,%ymm1 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#1 +# asm 2: vpor x3=%ymm0 +vpor %ymm0,%ymm12,%ymm0 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#2 +# asm 2: vpor x7=%ymm1 +vpor %ymm9,%ymm1,%ymm1 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm14,%ymm2,%ymm9 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#13 +# asm 2: vpslld $16,v10=%ymm12 +vpslld $16,%ymm11,%ymm12 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#14 +# asm 2: vpsrld $16,v01=%ymm13 +vpsrld $16,%ymm14,%ymm13 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm12,%ymm9 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm13,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm10,%ymm2,%ymm12 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#14 +# asm 2: vpslld $16,v10=%ymm13 +vpslld $16,%ymm0,%ymm13 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#1 +# asm 2: vpand v11=%ymm0 +vpand %ymm0,%ymm3,%ymm0 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#13 +# asm 2: vpor x1=%ymm12 +vpor %ymm12,%ymm13,%ymm12 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#1 +# asm 2: vpor x3=%ymm0 +vpor %ymm10,%ymm0,%ymm0 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm6,%ymm2,%ymm10 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#14 +# asm 2: vpslld $16,v10=%ymm13 +vpslld $16,%ymm8,%ymm13 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#11 +# asm 2: vpor x4=%ymm10 +vpor %ymm10,%ymm13,%ymm10 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#3 +# asm 2: vpand v00=%ymm2 +vpand %ymm7,%ymm2,%ymm2 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#9 +# asm 2: vpslld $16,v10=%ymm8 +vpslld $16,%ymm1,%ymm8 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#2 +# asm 2: vpand v11=%ymm1 +vpand %ymm1,%ymm3,%ymm1 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#3 +# asm 2: vpor x5=%ymm2 +vpor 
%ymm2,%ymm8,%ymm2 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#2 +# asm 2: vpor x7=%ymm1 +vpor %ymm7,%ymm1,%ymm1 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#4 +# asm 2: vpand v00=%ymm3 +vpand %ymm9,%ymm4,%ymm3 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#8 +# asm 2: vpsllw $8,v10=%ymm7 +vpsllw $8,%ymm12,%ymm7 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#9 +# asm 2: vpsrlw $8,v01=%ymm8 +vpsrlw $8,%ymm9,%ymm8 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm12,%ymm5,%ymm9 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#4 +# asm 2: vpor x0=%ymm3 +vpor %ymm3,%ymm7,%ymm3 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#8 +# asm 2: vpor x1=%ymm7 +vpor %ymm8,%ymm9,%ymm7 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm11,%ymm4,%ymm8 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#10 +# asm 2: vpsllw $8,v10=%ymm9 +vpsllw $8,%ymm0,%ymm9 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#1 +# asm 2: vpand v11=%ymm0 +vpand %ymm0,%ymm5,%ymm0 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#9 +# asm 2: vpor x2=%ymm8 +vpor %ymm8,%ymm9,%ymm8 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#1 +# asm 2: vpor x3=%ymm0 +vpor %ymm11,%ymm0,%ymm0 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm10,%ymm4,%ymm9 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#12 +# asm 2: vpsllw $8,v10=%ymm11 +vpsllw $8,%ymm2,%ymm11 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#11 +# asm 2: vpsrlw $8,v01=%ymm10 +vpsrlw $8,%ymm10,%ymm10 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#3 +# asm 2: vpand v11=%ymm2 +vpand %ymm2,%ymm5,%ymm2 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#10 +# asm 2: vpor x4=%ymm9 +vpor %ymm9,%ymm11,%ymm9 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#3 +# asm 2: vpor x5=%ymm2 +vpor %ymm10,%ymm2,%ymm2 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#5 +# asm 2: vpand v00=%ymm4 +vpand %ymm6,%ymm4,%ymm4 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#11 +# asm 2: vpsllw $8,v10=%ymm10 +vpsllw $8,%ymm1,%ymm10 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#2 +# asm 2: vpand v11=%ymm1 +vpand %ymm1,%ymm5,%ymm1 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#5 +# asm 2: vpor x6=%ymm4 +vpor %ymm4,%ymm10,%ymm4 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#2 +# asm 2: vpor x7=%ymm1 +vpor %ymm6,%ymm1,%ymm1 + +# qhasm: mem256[ input_0 + 224 ] = x0 +# asm 1: vmovupd mask0=reg256#1 +# asm 2: vmovapd PQCLEAN_MCELIECE8192128_AVX_MASK2_0,>mask0=%ymm0 +vmovapd PQCLEAN_MCELIECE8192128_AVX_MASK2_0(%rip),%ymm0 + +# qhasm: mask1 aligned= mem256[ PQCLEAN_MCELIECE8192128_AVX_MASK2_1 ] +# asm 1: vmovapd PQCLEAN_MCELIECE8192128_AVX_MASK2_1,>mask1=reg256#2 +# asm 2: vmovapd PQCLEAN_MCELIECE8192128_AVX_MASK2_1,>mask1=%ymm1 +vmovapd PQCLEAN_MCELIECE8192128_AVX_MASK2_1(%rip),%ymm1 + +# qhasm: mask2 aligned= mem256[ PQCLEAN_MCELIECE8192128_AVX_MASK1_0 ] +# asm 1: vmovapd PQCLEAN_MCELIECE8192128_AVX_MASK1_0,>mask2=reg256#3 +# asm 2: vmovapd PQCLEAN_MCELIECE8192128_AVX_MASK1_0,>mask2=%ymm2 +vmovapd PQCLEAN_MCELIECE8192128_AVX_MASK1_0(%rip),%ymm2 + +# qhasm: mask3 aligned= 
mem256[ PQCLEAN_MCELIECE8192128_AVX_MASK1_1 ] +# asm 1: vmovapd PQCLEAN_MCELIECE8192128_AVX_MASK1_1,>mask3=reg256#4 +# asm 2: vmovapd PQCLEAN_MCELIECE8192128_AVX_MASK1_1,>mask3=%ymm3 +vmovapd PQCLEAN_MCELIECE8192128_AVX_MASK1_1(%rip),%ymm3 + +# qhasm: mask4 aligned= mem256[ PQCLEAN_MCELIECE8192128_AVX_MASK0_0 ] +# asm 1: vmovapd PQCLEAN_MCELIECE8192128_AVX_MASK0_0,>mask4=reg256#5 +# asm 2: vmovapd PQCLEAN_MCELIECE8192128_AVX_MASK0_0,>mask4=%ymm4 +vmovapd PQCLEAN_MCELIECE8192128_AVX_MASK0_0(%rip),%ymm4 + +# qhasm: mask5 aligned= mem256[ PQCLEAN_MCELIECE8192128_AVX_MASK0_1 ] +# asm 1: vmovapd PQCLEAN_MCELIECE8192128_AVX_MASK0_1,>mask5=reg256#6 +# asm 2: vmovapd PQCLEAN_MCELIECE8192128_AVX_MASK0_1,>mask5=%ymm5 +vmovapd PQCLEAN_MCELIECE8192128_AVX_MASK0_1(%rip),%ymm5 + +# qhasm: x0 = mem256[ input_0 + 0 ] +# asm 1: vmovupd 0(x0=reg256#7 +# asm 2: vmovupd 0(x0=%ymm6 +vmovupd 0(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 32 ] +# asm 1: vmovupd 32(x1=reg256#8 +# asm 2: vmovupd 32(x1=%ymm7 +vmovupd 32(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 64 ] +# asm 1: vmovupd 64(x2=reg256#9 +# asm 2: vmovupd 64(x2=%ymm8 +vmovupd 64(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 96 ] +# asm 1: vmovupd 96(x3=reg256#10 +# asm 2: vmovupd 96(x3=%ymm9 +vmovupd 96(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 128 ] +# asm 1: vmovupd 128(x4=reg256#11 +# asm 2: vmovupd 128(x4=%ymm10 +vmovupd 128(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 160 ] +# asm 1: vmovupd 160(x5=reg256#12 +# asm 2: vmovupd 160(x5=%ymm11 +vmovupd 160(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 192 ] +# asm 1: vmovupd 192(x6=reg256#13 +# asm 2: vmovupd 192(x6=%ymm12 +vmovupd 192(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 224 ] +# asm 1: vmovupd 224(x7=reg256#14 +# asm 2: vmovupd 224(x7=%ymm13 +vmovupd 224(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: 
vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor 
x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 0 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 256(x0=%ymm6 +vmovupd 256(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 288 ] +# asm 1: vmovupd 288(x1=reg256#8 +# asm 2: vmovupd 288(x1=%ymm7 +vmovupd 288(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 320 ] +# asm 1: vmovupd 320(x2=reg256#9 +# asm 2: vmovupd 320(x2=%ymm8 +vmovupd 320(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 352 ] +# asm 1: vmovupd 352(x3=reg256#10 +# asm 2: vmovupd 352(x3=%ymm9 +vmovupd 352(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 384 ] +# asm 1: vmovupd 384(x4=reg256#11 +# asm 2: vmovupd 384(x4=%ymm10 +vmovupd 384(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 416 ] +# asm 1: vmovupd 416(x5=reg256#12 +# asm 2: vmovupd 416(x5=%ymm11 +vmovupd 416(%rdi),%ymm11 + +# qhasm: x6 = mem256[ 
input_0 + 448 ] +# asm 1: vmovupd 448(x6=reg256#13 +# asm 2: vmovupd 448(x6=%ymm12 +vmovupd 448(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 480 ] +# asm 1: vmovupd 480(x7=reg256#14 +# asm 2: vmovupd 480(x7=%ymm13 +vmovupd 480(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: 
vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand 
v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 256 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 512(x0=%ymm6 +vmovupd 512(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 544 ] +# asm 1: vmovupd 544(x1=reg256#8 +# asm 2: vmovupd 544(x1=%ymm7 +vmovupd 544(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 576 ] +# asm 1: vmovupd 576(x2=reg256#9 +# asm 2: vmovupd 576(x2=%ymm8 +vmovupd 576(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 608 ] +# asm 1: vmovupd 608(x3=reg256#10 +# asm 2: vmovupd 608(x3=%ymm9 +vmovupd 608(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 640 ] +# asm 1: vmovupd 640(x4=reg256#11 +# asm 2: vmovupd 640(x4=%ymm10 +vmovupd 640(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 672 ] +# asm 1: vmovupd 672(x5=reg256#12 +# asm 2: vmovupd 672(x5=%ymm11 +vmovupd 672(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 704 ] +# asm 1: vmovupd 704(x6=reg256#13 +# asm 2: vmovupd 704(x6=%ymm12 +vmovupd 704(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 736 ] +# asm 1: vmovupd 736(x7=reg256#14 +# asm 2: vmovupd 736(x7=%ymm13 +vmovupd 736(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor 
%ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: 
vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 512 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 768(x0=%ymm6 +vmovupd 768(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 800 ] +# asm 1: vmovupd 800(x1=reg256#8 +# asm 2: vmovupd 800(x1=%ymm7 +vmovupd 800(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 832 ] +# asm 1: vmovupd 832(x2=reg256#9 +# asm 2: vmovupd 832(x2=%ymm8 +vmovupd 832(%rdi),%ymm8 + +# 
qhasm: x3 = mem256[ input_0 + 864 ] +# asm 1: vmovupd 864(x3=reg256#10 +# asm 2: vmovupd 864(x3=%ymm9 +vmovupd 864(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 896 ] +# asm 1: vmovupd 896(x4=reg256#11 +# asm 2: vmovupd 896(x4=%ymm10 +vmovupd 896(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 928 ] +# asm 1: vmovupd 928(x5=reg256#12 +# asm 2: vmovupd 928(x5=%ymm11 +vmovupd 928(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 960 ] +# asm 1: vmovupd 960(x6=reg256#13 +# asm 2: vmovupd 960(x6=%ymm12 +vmovupd 960(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 992 ] +# asm 1: vmovupd 992(x7=reg256#14 +# asm 2: vmovupd 992(x7=%ymm13 +vmovupd 992(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand 
%ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand 
v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 768 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 1024(x0=%ymm6 +vmovupd 1024(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 1056 ] +# asm 1: vmovupd 1056(x1=reg256#8 +# asm 2: vmovupd 1056(x1=%ymm7 +vmovupd 1056(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 1088 ] +# asm 1: vmovupd 1088(x2=reg256#9 +# asm 2: vmovupd 1088(x2=%ymm8 +vmovupd 1088(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 1120 ] +# asm 1: vmovupd 1120(x3=reg256#10 +# asm 2: vmovupd 1120(x3=%ymm9 +vmovupd 1120(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1152 ] +# asm 1: vmovupd 1152(x4=reg256#11 +# asm 2: vmovupd 1152(x4=%ymm10 +vmovupd 1152(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1184 ] +# asm 1: vmovupd 1184(x5=reg256#12 +# asm 2: vmovupd 1184(x5=%ymm11 +vmovupd 1184(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1216 ] +# asm 1: vmovupd 1216(x6=reg256#13 +# asm 2: vmovupd 1216(x6=%ymm12 +vmovupd 1216(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1248 ] +# asm 1: vmovupd 1248(x7=reg256#14 +# asm 2: vmovupd 1248(x7=%ymm13 +vmovupd 1248(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand 
%ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 
2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# 
asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 1024 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 1280(x0=%ymm6 +vmovupd 1280(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 1312 ] +# asm 1: vmovupd 1312(x1=reg256#8 +# asm 2: vmovupd 1312(x1=%ymm7 +vmovupd 1312(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 1344 ] +# asm 1: vmovupd 1344(x2=reg256#9 +# asm 2: vmovupd 1344(x2=%ymm8 +vmovupd 1344(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 1376 ] +# asm 1: vmovupd 1376(x3=reg256#10 +# asm 2: vmovupd 1376(x3=%ymm9 +vmovupd 1376(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1408 ] +# asm 1: vmovupd 1408(x4=reg256#11 +# asm 2: vmovupd 1408(x4=%ymm10 +vmovupd 1408(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1440 ] +# asm 1: vmovupd 1440(x5=reg256#12 +# asm 2: vmovupd 1440(x5=%ymm11 +vmovupd 1440(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1472 ] +# asm 1: vmovupd 1472(x6=reg256#13 +# asm 2: vmovupd 1472(x6=%ymm12 +vmovupd 1472(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1504 ] +# asm 1: vmovupd 1504(x7=reg256#14 +# asm 2: vmovupd 1504(x7=%ymm13 +vmovupd 1504(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand 
v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# 
asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 1280 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 1536(x0=%ymm6 +vmovupd 1536(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 1568 ] +# asm 1: vmovupd 1568(x1=reg256#8 +# asm 2: vmovupd 1568(x1=%ymm7 +vmovupd 1568(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 1600 ] +# asm 1: vmovupd 1600(x2=reg256#9 +# asm 2: vmovupd 1600(x2=%ymm8 +vmovupd 1600(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 1632 ] +# asm 1: vmovupd 1632(x3=reg256#10 +# asm 2: vmovupd 1632(x3=%ymm9 +vmovupd 1632(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1664 ] +# asm 1: vmovupd 1664(x4=reg256#11 +# asm 2: vmovupd 1664(x4=%ymm10 +vmovupd 1664(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1696 ] +# asm 1: vmovupd 1696(x5=reg256#12 +# asm 2: vmovupd 1696(x5=%ymm11 +vmovupd 1696(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1728 ] +# asm 1: vmovupd 1728(x6=reg256#13 +# asm 2: vmovupd 1728(x6=%ymm12 +vmovupd 1728(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1760 ] +# asm 1: vmovupd 1760(x7=reg256#14 +# asm 2: vmovupd 1760(x7=%ymm13 +vmovupd 1760(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand 
v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor 
x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: 
vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 1536 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 1792(x0=%ymm6 +vmovupd 1792(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 1824 ] +# asm 1: vmovupd 1824(x1=reg256#8 +# asm 2: vmovupd 1824(x1=%ymm7 +vmovupd 1824(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 1856 ] +# asm 1: vmovupd 1856(x2=reg256#9 +# asm 2: vmovupd 1856(x2=%ymm8 +vmovupd 1856(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 1888 ] +# asm 1: vmovupd 1888(x3=reg256#10 +# asm 2: vmovupd 1888(x3=%ymm9 +vmovupd 1888(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1920 ] +# asm 1: vmovupd 1920(x4=reg256#11 +# asm 2: vmovupd 1920(x4=%ymm10 +vmovupd 1920(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1952 ] +# asm 1: vmovupd 1952(x5=reg256#12 +# asm 2: vmovupd 1952(x5=%ymm11 +vmovupd 1952(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1984 ] +# asm 1: vmovupd 1984(x6=reg256#13 +# asm 2: vmovupd 1984(x6=%ymm12 +vmovupd 1984(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 2016 ] +# asm 1: vmovupd 2016(x7=reg256#14 +# asm 2: vmovupd 2016(x7=%ymm13 +vmovupd 2016(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 
+# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#1 +# asm 2: vpand v10=%ymm0 +vpand %ymm13,%ymm0,%ymm0 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#2 +# asm 2: vpand v11=%ymm1 +vpand %ymm13,%ymm1,%ymm1 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#1 +# asm 2: vpor x3=%ymm0 +vpor %ymm12,%ymm0,%ymm0 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#2 +# asm 2: vpor x7=%ymm1 +vpor %ymm9,%ymm1,%ymm1 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm14,%ymm2,%ymm9 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#13 +# asm 2: vpand v10=%ymm12 +vpand %ymm11,%ymm2,%ymm12 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm14,%ymm3,%ymm13 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm12,%ymm9 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm13,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm10,%ymm2,%ymm12 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#14 +# asm 2: vpand v10=%ymm13 +vpand %ymm0,%ymm2,%ymm13 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#1 +# asm 2: vpand v11=%ymm0 +vpand %ymm0,%ymm3,%ymm0 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#13 +# asm 2: vpor x1=%ymm12 +vpor %ymm12,%ymm13,%ymm12 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#1 +# asm 2: vpor x3=%ymm0 +vpor %ymm10,%ymm0,%ymm0 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm6,%ymm2,%ymm10 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#14 +# asm 2: vpand v10=%ymm13 +vpand %ymm8,%ymm2,%ymm13 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#11 +# asm 2: vpor x4=%ymm10 +vpor %ymm10,%ymm13,%ymm10 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#3 +# asm 2: vpand v10=%ymm2 +vpand %ymm1,%ymm2,%ymm2 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#2 +# asm 2: vpand v11=%ymm1 +vpand %ymm1,%ymm3,%ymm1 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#3 +# asm 2: vpor x5=%ymm2 +vpor %ymm8,%ymm2,%ymm2 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#2 +# asm 2: vpor x7=%ymm1 +vpor %ymm7,%ymm1,%ymm1 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#4 +# asm 2: vpand v00=%ymm3 +vpand %ymm9,%ymm4,%ymm3 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#8 +# asm 2: vpand v10=%ymm7 
+vpand %ymm12,%ymm4,%ymm7 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm9,%ymm5,%ymm8 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm12,%ymm5,%ymm9 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#4 +# asm 2: vpor x0=%ymm3 +vpor %ymm3,%ymm7,%ymm3 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#8 +# asm 2: vpor x1=%ymm7 +vpor %ymm8,%ymm9,%ymm7 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm11,%ymm4,%ymm8 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#10 +# asm 2: vpand v10=%ymm9 +vpand %ymm0,%ymm4,%ymm9 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#1 +# asm 2: vpand v11=%ymm0 +vpand %ymm0,%ymm5,%ymm0 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#9 +# asm 2: vpor x2=%ymm8 +vpor %ymm8,%ymm9,%ymm8 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#1 +# asm 2: vpor x3=%ymm0 +vpor %ymm11,%ymm0,%ymm0 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm10,%ymm4,%ymm9 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#12 +# asm 2: vpand v10=%ymm11 +vpand %ymm2,%ymm4,%ymm11 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#3 +# asm 2: vpand v11=%ymm2 +vpand %ymm2,%ymm5,%ymm2 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#10 +# asm 2: vpor x4=%ymm9 +vpor %ymm9,%ymm11,%ymm9 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#3 +# asm 2: vpor x5=%ymm2 +vpor %ymm10,%ymm2,%ymm2 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm6,%ymm4,%ymm10 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#5 +# asm 2: vpand v10=%ymm4 +vpand %ymm1,%ymm4,%ymm4 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#2 +# asm 2: vpand v11=%ymm1 +vpand %ymm1,%ymm5,%ymm1 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#5 +# asm 2: vpor x6=%ymm4 +vpor %ymm10,%ymm4,%ymm4 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#2 +# asm 2: vpor x7=%ymm1 +vpor %ymm6,%ymm1,%ymm1 + +# qhasm: mem256[ input_0 + 1792 ] = x0 +# asm 1: vmovupd +#include + +void PQCLEAN_MCELIECE8192128_AVX_uint32_sort(uint32_t *x, size_t n); + +#endif diff --git a/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/update_asm.S b/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/update_asm.S new file mode 100644 index 0000000000..2c930045b3 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/update_asm.S @@ -0,0 +1,576 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: int64 s0 + +# qhasm: int64 s1 + +# qhasm: int64 s2 + +# qhasm: enter update_asm +.p2align 5 +.global _PQCLEAN_MCELIECE8192128_AVX_update_asm +.global PQCLEAN_MCELIECE8192128_AVX_update_asm +_PQCLEAN_MCELIECE8192128_AVX_update_asm: 
+PQCLEAN_MCELIECE8192128_AVX_update_asm: +mov %rsp,%r11 +and $31,%r11 +add $0,%r11 +sub %r11,%rsp + +# qhasm: s2 = input_1 +# asm 1: mov s2=int64#2 +# asm 2: mov s2=%rsi +mov %rsi,%rsi + +# qhasm: s0 = mem64[ input_0 + 0 ] +# asm 1: movq 0(s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr 
$1, + +void PQCLEAN_MCELIECE8192128_AVX_store_i(unsigned char *out, uint64_t in, int i) { + for (int j = 0; j < i; j++) { + out[j] = (in >> (j * 8)) & 0xFF; + } +} + +void PQCLEAN_MCELIECE8192128_AVX_store2(unsigned char *dest, uint16_t a) { + dest[0] = a & 0xFF; + dest[1] = a >> 8; +} + +uint16_t PQCLEAN_MCELIECE8192128_AVX_load2(const unsigned char *src) { + uint16_t a; + + a = src[1]; + a <<= 8; + a |= src[0]; + + return a & GFMASK; +} + +uint32_t PQCLEAN_MCELIECE8192128_AVX_load4(const unsigned char *src) { + uint32_t a; + + a = src[3]; + a <<= 8; + a |= src[2]; + a <<= 8; + a |= src[1]; + a <<= 8; + a |= src[0]; + + return a; +} + +void PQCLEAN_MCELIECE8192128_AVX_irr_load(vec128 *out, const unsigned char *in) { + int i, j; + uint64_t v0 = 0, v1 = 0; + uint16_t irr[ SYS_T ]; + + for (i = 0; i < SYS_T; i++) { + irr[i] = PQCLEAN_MCELIECE8192128_AVX_load2(in + i * 2); + irr[i] &= GFMASK; + } + + for (i = 0; i < GFBITS; i++) { + for (j = 63; j >= 0; j--) { + v0 <<= 1; + v1 <<= 1; + v0 |= (irr[j] >> i) & 1; + v1 |= (irr[j + 64] >> i) & 1; + } + + out[i] = PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(v0, v1); + } +} + +void PQCLEAN_MCELIECE8192128_AVX_store8(unsigned char *out, uint64_t in) { + out[0] = (in >> 0x00) & 0xFF; + out[1] = (in >> 0x08) & 0xFF; + out[2] = (in >> 0x10) & 0xFF; + out[3] = (in >> 0x18) & 0xFF; + out[4] = (in >> 0x20) & 0xFF; + out[5] = (in >> 0x28) & 0xFF; + out[6] = (in >> 0x30) & 0xFF; + out[7] = (in >> 0x38) & 0xFF; +} + +uint64_t PQCLEAN_MCELIECE8192128_AVX_load8(const unsigned char *in) { + int i; + uint64_t ret = in[7]; + + for (i = 6; i >= 0; i--) { + ret <<= 8; + ret |= in[i]; + } + + return ret; +} + +vec128 PQCLEAN_MCELIECE8192128_AVX_load16(const unsigned char *in) { + return PQCLEAN_MCELIECE8192128_AVX_vec128_set2x( PQCLEAN_MCELIECE8192128_AVX_load8(in), PQCLEAN_MCELIECE8192128_AVX_load8(in + 8) ); +} + +void PQCLEAN_MCELIECE8192128_AVX_store16(unsigned char *out, vec128 in) { + PQCLEAN_MCELIECE8192128_AVX_store8(out + 0, PQCLEAN_MCELIECE8192128_AVX_vec128_extract(in, 0)); + PQCLEAN_MCELIECE8192128_AVX_store8(out + 8, PQCLEAN_MCELIECE8192128_AVX_vec128_extract(in, 1)); +} diff --git a/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/util.h b/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/util.h new file mode 100644 index 0000000000..17b7538f83 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/util.h @@ -0,0 +1,23 @@ +#ifndef PQCLEAN_MCELIECE8192128_AVX_UTIL_H +#define PQCLEAN_MCELIECE8192128_AVX_UTIL_H +/* + This file is for loading/storing data in a little-endian fashion +*/ + + +#include "vec128.h" + +#include + +void PQCLEAN_MCELIECE8192128_AVX_store_i(unsigned char *out, uint64_t in, int i); +void PQCLEAN_MCELIECE8192128_AVX_store2(unsigned char *dest, uint16_t a); +uint16_t PQCLEAN_MCELIECE8192128_AVX_load2(const unsigned char *src); +uint32_t PQCLEAN_MCELIECE8192128_AVX_load4(const unsigned char *src); +void PQCLEAN_MCELIECE8192128_AVX_irr_load(vec128 *out, const unsigned char *in); +void PQCLEAN_MCELIECE8192128_AVX_store8(unsigned char *out, uint64_t in); +uint64_t PQCLEAN_MCELIECE8192128_AVX_load8(const unsigned char *in); +vec128 PQCLEAN_MCELIECE8192128_AVX_load16(const unsigned char *in); + +void PQCLEAN_MCELIECE8192128_AVX_store16(unsigned char *out, vec128 in); + +#endif diff --git a/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/vec128.c b/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/vec128.c new file mode 100644 index 0000000000..7403dde1a3 --- /dev/null +++ 
b/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/vec128.c @@ -0,0 +1,79 @@ +#include "vec128.h" + +vec128 PQCLEAN_MCELIECE8192128_AVX_vec128_set1_16b(uint16_t a) { + return _mm_set1_epi16(a); +} + +vec128 PQCLEAN_MCELIECE8192128_AVX_vec128_setzero(void) { + return _mm_setzero_si128(); +} + +vec128 PQCLEAN_MCELIECE8192128_AVX_vec128_and(vec128 a, vec128 b) { + return _mm_and_si128(a, b); +} + +vec128 PQCLEAN_MCELIECE8192128_AVX_vec128_xor(vec128 a, vec128 b) { + return _mm_xor_si128(a, b); +} + +vec128 PQCLEAN_MCELIECE8192128_AVX_vec128_or(vec128 a, vec128 b) { + return _mm_or_si128(a, b); +} + +vec128 PQCLEAN_MCELIECE8192128_AVX_vec128_sll_2x(vec128 a, int s) { + return _mm_slli_epi64(a, s); +} + +vec128 PQCLEAN_MCELIECE8192128_AVX_vec128_srl_2x(vec128 a, int s) { + return _mm_srli_epi64(a, s); +} + +vec128 PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(uint64_t a0, uint64_t a1) { + return _mm_set_epi64x(a1, a0); +} + +vec128 PQCLEAN_MCELIECE8192128_AVX_vec128_unpack_low(vec128 a, vec128 b) { + return _mm_unpacklo_epi64(a, b); +} + +vec128 PQCLEAN_MCELIECE8192128_AVX_vec128_unpack_high(vec128 a, vec128 b) { + return _mm_unpackhi_epi64(a, b); +} + +vec128 PQCLEAN_MCELIECE8192128_AVX_vec128_setbits(uint64_t a) { + return _mm_set1_epi64x(-a); +} + +void PQCLEAN_MCELIECE8192128_AVX_vec128_copy(vec128 *dest, const vec128 *src) { + int i; + + for (i = 0; i < GFBITS; i++) { + dest[i] = src[i]; + } +} + +void PQCLEAN_MCELIECE8192128_AVX_vec128_add(vec128 *c, const vec128 *a, const vec128 *b) { + int i; + + for (i = 0; i < GFBITS; i++) { + c[i] = PQCLEAN_MCELIECE8192128_AVX_vec128_xor(a[i], b[i]); + } +} + +vec128 PQCLEAN_MCELIECE8192128_AVX_vec128_or_reduce(const vec128 *a) { + int i; + vec128 ret; + + ret = a[0]; + for (i = 1; i < GFBITS; i++) { + ret = PQCLEAN_MCELIECE8192128_AVX_vec128_or(ret, a[i]); + } + + return ret; +} + +/* bitsliced field multiplications */ +void PQCLEAN_MCELIECE8192128_AVX_vec128_mul(vec128 *h, vec128 *f, const vec128 *g) { + PQCLEAN_MCELIECE8192128_AVX_vec128_mul_asm(h, f, g, 16); +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/vec128.h b/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/vec128.h new file mode 100644 index 0000000000..c0cc80f7f4 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/vec128.h @@ -0,0 +1,41 @@ +#ifndef PQCLEAN_MCELIECE8192128_AVX_VEC128_H +#define PQCLEAN_MCELIECE8192128_AVX_VEC128_H +/* + This file is for functions related to 128-bit vectors + including functions for bitsliced field operations +*/ + + +#include "params.h" + +#include +#include + +typedef __m128i vec128; + +// this needs to be a macro, because +// _mm_extract_epi64 requires a literal int argument. 
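+// Usage note (illustrative sketch, not part of the upstream PQClean sources):
+// since the lane index must be a compile-time constant, callers extract the
+// two 64-bit halves of a vec128 with literal indices, e.g.
+//   uint64_t lo = PQCLEAN_MCELIECE8192128_AVX_vec128_extract(v, 0);
+//   uint64_t hi = PQCLEAN_MCELIECE8192128_AVX_vec128_extract(v, 1);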
+#define PQCLEAN_MCELIECE8192128_AVX_vec128_extract(a, i) ((uint64_t) _mm_extract_epi64((vec128) (a), (i))) + +int PQCLEAN_MCELIECE8192128_AVX_vec128_testz(vec128 a); +vec128 PQCLEAN_MCELIECE8192128_AVX_vec128_set1_16b(uint16_t a); +vec128 PQCLEAN_MCELIECE8192128_AVX_vec128_setzero(void); +vec128 PQCLEAN_MCELIECE8192128_AVX_vec128_and(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE8192128_AVX_vec128_xor(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE8192128_AVX_vec128_or(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE8192128_AVX_vec128_sll_2x(vec128 a, int s); +vec128 PQCLEAN_MCELIECE8192128_AVX_vec128_srl_2x(vec128 a, int s); +vec128 PQCLEAN_MCELIECE8192128_AVX_vec128_set2x(uint64_t a0, uint64_t a1); +vec128 PQCLEAN_MCELIECE8192128_AVX_vec128_unpack_low(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE8192128_AVX_vec128_unpack_high(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE8192128_AVX_vec128_setbits(uint64_t a); +void PQCLEAN_MCELIECE8192128_AVX_vec128_copy(vec128 *dest, const vec128 *src); +void PQCLEAN_MCELIECE8192128_AVX_vec128_add(vec128 *c, const vec128 *a, const vec128 *b); +vec128 PQCLEAN_MCELIECE8192128_AVX_vec128_or_reduce(const vec128 *a); + +extern void PQCLEAN_MCELIECE8192128_AVX_vec128_mul_asm(vec128 *, vec128 *, const vec128 *, int); + +/* bitsliced field multiplications */ +void PQCLEAN_MCELIECE8192128_AVX_vec128_mul(vec128 *h, vec128 *f, const vec128 *g); + +#endif diff --git a/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/vec128_mul_asm.S b/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/vec128_mul_asm.S new file mode 100644 index 0000000000..d886c3da77 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/vec128_mul_asm.S @@ -0,0 +1,1816 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: reg256 b0 + +# qhasm: reg256 b1 + +# qhasm: reg256 b2 + +# qhasm: reg256 b3 + +# qhasm: reg256 b4 + +# qhasm: reg256 b5 + +# qhasm: reg256 b6 + +# qhasm: reg256 b7 + +# qhasm: reg256 b8 + +# qhasm: reg256 b9 + +# qhasm: reg256 b10 + +# qhasm: reg256 b11 + +# qhasm: reg256 b12 + +# qhasm: reg256 a0 + +# qhasm: reg256 a1 + +# qhasm: reg256 a2 + +# qhasm: reg256 a3 + +# qhasm: reg256 a4 + +# qhasm: reg256 a5 + +# qhasm: reg256 a6 + +# qhasm: reg256 r0 + +# qhasm: reg256 r1 + +# qhasm: reg256 r2 + +# qhasm: reg256 r3 + +# qhasm: reg256 r4 + +# qhasm: reg256 r5 + +# qhasm: reg256 r6 + +# qhasm: reg256 r7 + +# qhasm: reg256 r8 + +# qhasm: reg256 r9 + +# qhasm: reg256 r10 + +# qhasm: reg256 r11 + +# qhasm: reg256 r12 + +# qhasm: reg256 r13 + +# qhasm: reg256 r14 + +# qhasm: reg256 r15 + +# qhasm: reg256 r16 + +# qhasm: reg256 r17 + +# qhasm: reg256 r18 + +# qhasm: reg256 r19 + +# qhasm: reg256 r20 + +# qhasm: reg256 r21 + +# qhasm: reg256 r22 + +# qhasm: reg256 r23 + +# qhasm: reg256 r24 + +# qhasm: reg256 r + +# qhasm: reg128 h0 + +# qhasm: reg128 h1 + +# qhasm: reg128 h2 + +# qhasm: reg128 h3 + +# qhasm: reg128 h4 + +# qhasm: reg128 h5 + +# qhasm: reg128 h6 + +# qhasm: reg128 h7 + +# qhasm: reg128 h8 + +# qhasm: reg128 h9 + +# qhasm: reg128 h10 + +# qhasm: reg128 h11 + +# qhasm: reg128 h12 + +# qhasm: reg128 h13 + +# qhasm: reg128 h14 + +# qhasm: reg128 h15 + +# qhasm: reg128 h16 + +# qhasm: 
reg128 h17 + +# qhasm: reg128 h18 + +# qhasm: reg128 h19 + +# qhasm: reg128 h20 + +# qhasm: reg128 h21 + +# qhasm: reg128 h22 + +# qhasm: reg128 h23 + +# qhasm: reg128 h24 + +# qhasm: stack4864 buf + +# qhasm: int64 ptr + +# qhasm: int64 tmp + +# qhasm: enter vec128_mul_asm +.p2align 5 +.global _PQCLEAN_MCELIECE8192128_AVX_vec128_mul_asm +.global PQCLEAN_MCELIECE8192128_AVX_vec128_mul_asm +_PQCLEAN_MCELIECE8192128_AVX_vec128_mul_asm: +PQCLEAN_MCELIECE8192128_AVX_vec128_mul_asm: +mov %rsp,%r11 +and $31,%r11 +add $608,%r11 +sub %r11,%rsp + +# qhasm: ptr = &buf +# asm 1: leaq ptr=int64#5 +# asm 2: leaq ptr=%r8 +leaq 0(%rsp),%r8 + +# qhasm: tmp = input_3 +# asm 1: mov tmp=int64#6 +# asm 2: mov tmp=%r9 +mov %rcx,%r9 + +# qhasm: tmp *= 12 +# asm 1: imulq $12,tmp=int64#6 +# asm 2: imulq $12,tmp=%r9 +imulq $12,%r9,%r9 + +# qhasm: input_2 += tmp +# asm 1: add b12=reg256#1 +# asm 2: vbroadcasti128 0(b12=%ymm0 +vbroadcasti128 0(%rdx), %ymm0 + +# qhasm: input_2 -= input_3 +# asm 1: sub a6=reg256#2 +# asm 2: vpxor a6=%ymm1 +vpxor %ymm1,%ymm1,%ymm1 + +# qhasm: a6[0] = mem128[ input_1 + 96 ] +# asm 1: vinsertf128 $0x0,96(r18=reg256#3 +# asm 2: vpand r18=%ymm2 +vpand %ymm0,%ymm1,%ymm2 + +# qhasm: mem256[ ptr + 576 ] = r18 +# asm 1: vmovupd r17=reg256#4 +# asm 2: vpand r17=%ymm3 +vpand %ymm0,%ymm2,%ymm3 + +# qhasm: a4[0] = mem128[ input_1 + 64 ] +# asm 1: vinsertf128 $0x0,64(r16=reg256#6 +# asm 2: vpand r16=%ymm5 +vpand %ymm0,%ymm4,%ymm5 + +# qhasm: a3[0] = mem128[ input_1 + 48 ] +# asm 1: vinsertf128 $0x0,48(r15=reg256#8 +# asm 2: vpand r15=%ymm7 +vpand %ymm0,%ymm6,%ymm7 + +# qhasm: a2[0] = mem128[ input_1 + 32 ] +# asm 1: vinsertf128 $0x0,32(r14=reg256#10 +# asm 2: vpand r14=%ymm9 +vpand %ymm0,%ymm8,%ymm9 + +# qhasm: a1[0] = mem128[ input_1 + 16 ] +# asm 1: vinsertf128 $0x0,16(r13=reg256#12 +# asm 2: vpand r13=%ymm11 +vpand %ymm0,%ymm10,%ymm11 + +# qhasm: a0[0] = mem128[ input_1 + 0 ] +# asm 1: vinsertf128 $0x0,0(r12=reg256#1 +# asm 2: vpand r12=%ymm0 +vpand %ymm0,%ymm12,%ymm0 + +# qhasm: b11 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b11=reg256#14 +# asm 2: vbroadcasti128 0(b11=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm2,%ymm3 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm4,%ymm3 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm6,%ymm3 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm8,%ymm3 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm10,%ymm3 + +# qhasm: r12 ^= r +# asm 1: vpxor r11=reg256#4 +# asm 2: vpand r11=%ymm3 +vpand %ymm13,%ymm12,%ymm3 + +# qhasm: b10 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b10=reg256#14 +# asm 2: vbroadcasti128 0(b10=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm2,%ymm5 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm4,%ymm5 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm6,%ymm5 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm8,%ymm5 + +# qhasm: r12 ^= r +# asm 1: vpxor 
r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm10,%ymm5 + +# qhasm: r11 ^= r +# asm 1: vpxor r10=reg256#6 +# asm 2: vpand r10=%ymm5 +vpand %ymm13,%ymm12,%ymm5 + +# qhasm: b9 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b9=reg256#14 +# asm 2: vbroadcasti128 0(b9=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm2,%ymm7 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm4,%ymm7 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm6,%ymm7 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm8,%ymm7 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm10,%ymm7 + +# qhasm: r10 ^= r +# asm 1: vpxor r9=reg256#8 +# asm 2: vpand r9=%ymm7 +vpand %ymm13,%ymm12,%ymm7 + +# qhasm: b8 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b8=reg256#14 +# asm 2: vbroadcasti128 0(b8=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm2,%ymm9 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm6,%ymm9 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm8,%ymm9 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm10,%ymm9 + +# qhasm: r9 ^= r +# asm 1: vpxor r8=reg256#10 +# asm 2: vpand r8=%ymm9 +vpand %ymm13,%ymm12,%ymm9 + +# qhasm: b7 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b7=reg256#14 +# asm 2: vbroadcasti128 0(b7=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm2,%ymm11 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm4,%ymm11 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm6,%ymm11 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm8,%ymm11 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm10,%ymm11 + +# qhasm: r8 ^= r +# asm 1: vpxor r7=reg256#12 +# asm 2: vpand r7=%ymm11 +vpand %ymm13,%ymm12,%ymm11 + +# qhasm: b6 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b6=reg256#14 +# asm 2: vbroadcasti128 0(b6=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm2,%ymm0 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm4,%ymm0 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm6,%ymm0 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm8,%ymm0 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm10,%ymm0 + +# qhasm: r7 ^= r +# asm 1: vpxor r6=reg256#1 +# asm 2: vpand 
r6=%ymm0 +vpand %ymm13,%ymm12,%ymm0 + +# qhasm: b5 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b5=reg256#14 +# asm 2: vbroadcasti128 0(b5=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm2,%ymm3 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm4,%ymm3 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm6,%ymm3 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm8,%ymm3 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm10,%ymm3 + +# qhasm: r6 ^= r +# asm 1: vpxor r5=reg256#4 +# asm 2: vpand r5=%ymm3 +vpand %ymm13,%ymm12,%ymm3 + +# qhasm: b4 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b4=reg256#14 +# asm 2: vbroadcasti128 0(b4=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm2,%ymm5 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm4,%ymm5 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm6,%ymm5 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm8,%ymm5 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm10,%ymm5 + +# qhasm: r5 ^= r +# asm 1: vpxor r4=reg256#6 +# asm 2: vpand r4=%ymm5 +vpand %ymm13,%ymm12,%ymm5 + +# qhasm: b3 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b3=reg256#14 +# asm 2: vbroadcasti128 0(b3=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm2,%ymm7 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm4,%ymm7 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm6,%ymm7 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm8,%ymm7 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm10,%ymm7 + +# qhasm: r4 ^= r +# asm 1: vpxor r3=reg256#8 +# asm 2: vpand r3=%ymm7 +vpand %ymm13,%ymm12,%ymm7 + +# qhasm: b2 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b2=reg256#14 +# asm 2: vbroadcasti128 0(b2=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm2,%ymm9 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm6,%ymm9 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm8,%ymm9 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm10,%ymm9 + +# qhasm: r3 ^= r +# asm 1: vpxor r2=reg256#10 +# asm 2: vpand r2=%ymm9 +vpand %ymm13,%ymm12,%ymm9 + +# qhasm: b1 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b1=reg256#14 +# asm 2: vbroadcasti128 0(b1=%ymm13 
+vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm2,%ymm11 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm4,%ymm11 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm6,%ymm11 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm8,%ymm11 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm10,%ymm11 + +# qhasm: r2 ^= r +# asm 1: vpxor r1=reg256#12 +# asm 2: vpand r1=%ymm11 +vpand %ymm13,%ymm12,%ymm11 + +# qhasm: b0 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b0=reg256#14 +# asm 2: vbroadcasti128 0(b0=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#2 +# asm 2: vpand r=%ymm1 +vpand %ymm13,%ymm1,%ymm1 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm2,%ymm0 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm4,%ymm0 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm6,%ymm0 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm8,%ymm0 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm10,%ymm0 + +# qhasm: r1 ^= r +# asm 1: vpxor r0=reg256#1 +# asm 2: vpand r0=%ymm0 +vpand %ymm13,%ymm12,%ymm0 + +# qhasm: mem256[ ptr + 160 ] = r5 +# asm 1: vmovupd h24=reg128#1 +# asm 2: movdqu 560(h24=%xmm0 +movdqu 560(%r8),%xmm0 + +# qhasm: h11 = h24 +# asm 1: movdqa h11=reg128#2 +# asm 2: movdqa h11=%xmm1 +movdqa %xmm0,%xmm1 + +# qhasm: h12 = h24 +# asm 1: movdqa h12=reg128#3 +# asm 2: movdqa h12=%xmm2 +movdqa %xmm0,%xmm2 + +# qhasm: h14 = h24 +# asm 1: movdqa h14=reg128#4 +# asm 2: movdqa h14=%xmm3 +movdqa %xmm0,%xmm3 + +# qhasm: h15 = h24 +# asm 1: movdqa h15=reg128#1 +# asm 2: movdqa h15=%xmm0 +movdqa %xmm0,%xmm0 + +# qhasm: h23 = mem128[ ptr + 528 ] +# asm 1: movdqu 528(h23=reg128#5 +# asm 2: movdqu 528(h23=%xmm4 +movdqu 528(%r8),%xmm4 + +# qhasm: h10 = h23 +# asm 1: movdqa h10=reg128#6 +# asm 2: movdqa h10=%xmm5 +movdqa %xmm4,%xmm5 + +# qhasm: h11 = h11 ^ h23 +# asm 1: vpxor h11=reg128#2 +# asm 2: vpxor h11=%xmm1 +vpxor %xmm4,%xmm1,%xmm1 + +# qhasm: h13 = h23 +# asm 1: movdqa h13=reg128#7 +# asm 2: movdqa h13=%xmm6 +movdqa %xmm4,%xmm6 + +# qhasm: h14 = h14 ^ h23 +# asm 1: vpxor h14=reg128#4 +# asm 2: vpxor h14=%xmm3 +vpxor %xmm4,%xmm3,%xmm3 + +# qhasm: h22 = mem128[ ptr + 496 ] +# asm 1: movdqu 496(h22=reg128#5 +# asm 2: movdqu 496(h22=%xmm4 +movdqu 496(%r8),%xmm4 + +# qhasm: h9 = h22 +# asm 1: movdqa h9=reg128#8 +# asm 2: movdqa h9=%xmm7 +movdqa %xmm4,%xmm7 + +# qhasm: h10 = h10 ^ h22 +# asm 1: vpxor h10=reg128#6 +# asm 2: vpxor h10=%xmm5 +vpxor %xmm4,%xmm5,%xmm5 + +# qhasm: h12 = h12 ^ h22 +# asm 1: vpxor h12=reg128#3 +# asm 2: vpxor h12=%xmm2 +vpxor %xmm4,%xmm2,%xmm2 + +# qhasm: h13 = h13 ^ h22 +# asm 1: vpxor h13=reg128#5 +# asm 2: vpxor h13=%xmm4 +vpxor %xmm4,%xmm6,%xmm4 + +# qhasm: h21 = mem128[ ptr + 464 ] +# asm 1: movdqu 464(h21=reg128#7 +# asm 2: movdqu 464(h21=%xmm6 +movdqu 464(%r8),%xmm6 + +# qhasm: h8 = h21 +# asm 1: movdqa h8=reg128#9 +# asm 2: movdqa h8=%xmm8 +movdqa %xmm6,%xmm8 + +# qhasm: h9 = h9 ^ h21 +# asm 1: vpxor h9=reg128#8 +# asm 2: vpxor h9=%xmm7 +vpxor %xmm6,%xmm7,%xmm7 + +# qhasm: h11 = h11 ^ h21 +# asm 
1: vpxor h11=reg128#2 +# asm 2: vpxor h11=%xmm1 +vpxor %xmm6,%xmm1,%xmm1 + +# qhasm: h12 = h12 ^ h21 +# asm 1: vpxor h12=reg128#3 +# asm 2: vpxor h12=%xmm2 +vpxor %xmm6,%xmm2,%xmm2 + +# qhasm: h20 = mem128[ ptr + 432 ] +# asm 1: movdqu 432(h20=reg128#7 +# asm 2: movdqu 432(h20=%xmm6 +movdqu 432(%r8),%xmm6 + +# qhasm: h7 = h20 +# asm 1: movdqa h7=reg128#10 +# asm 2: movdqa h7=%xmm9 +movdqa %xmm6,%xmm9 + +# qhasm: h8 = h8 ^ h20 +# asm 1: vpxor h8=reg128#9 +# asm 2: vpxor h8=%xmm8 +vpxor %xmm6,%xmm8,%xmm8 + +# qhasm: h10 = h10 ^ h20 +# asm 1: vpxor h10=reg128#6 +# asm 2: vpxor h10=%xmm5 +vpxor %xmm6,%xmm5,%xmm5 + +# qhasm: h11 = h11 ^ h20 +# asm 1: vpxor h11=reg128#2 +# asm 2: vpxor h11=%xmm1 +vpxor %xmm6,%xmm1,%xmm1 + +# qhasm: h19 = mem128[ ptr + 400 ] +# asm 1: movdqu 400(h19=reg128#7 +# asm 2: movdqu 400(h19=%xmm6 +movdqu 400(%r8),%xmm6 + +# qhasm: h6 = h19 +# asm 1: movdqa h6=reg128#11 +# asm 2: movdqa h6=%xmm10 +movdqa %xmm6,%xmm10 + +# qhasm: h7 = h7 ^ h19 +# asm 1: vpxor h7=reg128#10 +# asm 2: vpxor h7=%xmm9 +vpxor %xmm6,%xmm9,%xmm9 + +# qhasm: h9 = h9 ^ h19 +# asm 1: vpxor h9=reg128#8 +# asm 2: vpxor h9=%xmm7 +vpxor %xmm6,%xmm7,%xmm7 + +# qhasm: h10 = h10 ^ h19 +# asm 1: vpxor h10=reg128#6 +# asm 2: vpxor h10=%xmm5 +vpxor %xmm6,%xmm5,%xmm5 + +# qhasm: h18 = mem128[ ptr + 368 ] +# asm 1: movdqu 368(h18=reg128#7 +# asm 2: movdqu 368(h18=%xmm6 +movdqu 368(%r8),%xmm6 + +# qhasm: h18 = h18 ^ mem128[ ptr + 576 ] +# asm 1: vpxor 576(h18=reg128#7 +# asm 2: vpxor 576(h18=%xmm6 +vpxor 576(%r8),%xmm6,%xmm6 + +# qhasm: h5 = h18 +# asm 1: movdqa h5=reg128#12 +# asm 2: movdqa h5=%xmm11 +movdqa %xmm6,%xmm11 + +# qhasm: h6 = h6 ^ h18 +# asm 1: vpxor h6=reg128#11 +# asm 2: vpxor h6=%xmm10 +vpxor %xmm6,%xmm10,%xmm10 + +# qhasm: h8 = h8 ^ h18 +# asm 1: vpxor h8=reg128#9 +# asm 2: vpxor h8=%xmm8 +vpxor %xmm6,%xmm8,%xmm8 + +# qhasm: h9 = h9 ^ h18 +# asm 1: vpxor h9=reg128#7 +# asm 2: vpxor h9=%xmm6 +vpxor %xmm6,%xmm7,%xmm6 + +# qhasm: h17 = mem128[ ptr + 336 ] +# asm 1: movdqu 336(h17=reg128#8 +# asm 2: movdqu 336(h17=%xmm7 +movdqu 336(%r8),%xmm7 + +# qhasm: h17 = h17 ^ mem128[ ptr + 544 ] +# asm 1: vpxor 544(h17=reg128#8 +# asm 2: vpxor 544(h17=%xmm7 +vpxor 544(%r8),%xmm7,%xmm7 + +# qhasm: h4 = h17 +# asm 1: movdqa h4=reg128#13 +# asm 2: movdqa h4=%xmm12 +movdqa %xmm7,%xmm12 + +# qhasm: h5 = h5 ^ h17 +# asm 1: vpxor h5=reg128#12 +# asm 2: vpxor h5=%xmm11 +vpxor %xmm7,%xmm11,%xmm11 + +# qhasm: h7 = h7 ^ h17 +# asm 1: vpxor h7=reg128#10 +# asm 2: vpxor h7=%xmm9 +vpxor %xmm7,%xmm9,%xmm9 + +# qhasm: h8 = h8 ^ h17 +# asm 1: vpxor h8=reg128#8 +# asm 2: vpxor h8=%xmm7 +vpxor %xmm7,%xmm8,%xmm7 + +# qhasm: h16 = mem128[ ptr + 304 ] +# asm 1: movdqu 304(h16=reg128#9 +# asm 2: movdqu 304(h16=%xmm8 +movdqu 304(%r8),%xmm8 + +# qhasm: h16 = h16 ^ mem128[ ptr + 512 ] +# asm 1: vpxor 512(h16=reg128#9 +# asm 2: vpxor 512(h16=%xmm8 +vpxor 512(%r8),%xmm8,%xmm8 + +# qhasm: h3 = h16 +# asm 1: movdqa h3=reg128#14 +# asm 2: movdqa h3=%xmm13 +movdqa %xmm8,%xmm13 + +# qhasm: h4 = h4 ^ h16 +# asm 1: vpxor h4=reg128#13 +# asm 2: vpxor h4=%xmm12 +vpxor %xmm8,%xmm12,%xmm12 + +# qhasm: h6 = h6 ^ h16 +# asm 1: vpxor h6=reg128#11 +# asm 2: vpxor h6=%xmm10 +vpxor %xmm8,%xmm10,%xmm10 + +# qhasm: h7 = h7 ^ h16 +# asm 1: vpxor h7=reg128#9 +# asm 2: vpxor h7=%xmm8 +vpxor %xmm8,%xmm9,%xmm8 + +# qhasm: h15 = h15 ^ mem128[ ptr + 272 ] +# asm 1: vpxor 272(h15=reg128#1 +# asm 2: vpxor 272(h15=%xmm0 +vpxor 272(%r8),%xmm0,%xmm0 + +# qhasm: h15 = h15 ^ mem128[ ptr + 480 ] +# asm 1: vpxor 480(h15=reg128#1 +# asm 2: vpxor 480(h15=%xmm0 +vpxor 
480(%r8),%xmm0,%xmm0 + +# qhasm: h2 = h15 +# asm 1: movdqa h2=reg128#10 +# asm 2: movdqa h2=%xmm9 +movdqa %xmm0,%xmm9 + +# qhasm: h3 = h3 ^ h15 +# asm 1: vpxor h3=reg128#14 +# asm 2: vpxor h3=%xmm13 +vpxor %xmm0,%xmm13,%xmm13 + +# qhasm: h5 = h5 ^ h15 +# asm 1: vpxor h5=reg128#12 +# asm 2: vpxor h5=%xmm11 +vpxor %xmm0,%xmm11,%xmm11 + +# qhasm: h6 = h6 ^ h15 +# asm 1: vpxor h6=reg128#1 +# asm 2: vpxor h6=%xmm0 +vpxor %xmm0,%xmm10,%xmm0 + +# qhasm: h14 = h14 ^ mem128[ ptr + 240 ] +# asm 1: vpxor 240(h14=reg128#4 +# asm 2: vpxor 240(h14=%xmm3 +vpxor 240(%r8),%xmm3,%xmm3 + +# qhasm: h14 = h14 ^ mem128[ ptr + 448 ] +# asm 1: vpxor 448(h14=reg128#4 +# asm 2: vpxor 448(h14=%xmm3 +vpxor 448(%r8),%xmm3,%xmm3 + +# qhasm: h1 = h14 +# asm 1: movdqa h1=reg128#11 +# asm 2: movdqa h1=%xmm10 +movdqa %xmm3,%xmm10 + +# qhasm: h2 = h2 ^ h14 +# asm 1: vpxor h2=reg128#10 +# asm 2: vpxor h2=%xmm9 +vpxor %xmm3,%xmm9,%xmm9 + +# qhasm: h4 = h4 ^ h14 +# asm 1: vpxor h4=reg128#13 +# asm 2: vpxor h4=%xmm12 +vpxor %xmm3,%xmm12,%xmm12 + +# qhasm: h5 = h5 ^ h14 +# asm 1: vpxor h5=reg128#4 +# asm 2: vpxor h5=%xmm3 +vpxor %xmm3,%xmm11,%xmm3 + +# qhasm: h13 = h13 ^ mem128[ ptr + 208 ] +# asm 1: vpxor 208(h13=reg128#5 +# asm 2: vpxor 208(h13=%xmm4 +vpxor 208(%r8),%xmm4,%xmm4 + +# qhasm: h13 = h13 ^ mem128[ ptr + 416 ] +# asm 1: vpxor 416(h13=reg128#5 +# asm 2: vpxor 416(h13=%xmm4 +vpxor 416(%r8),%xmm4,%xmm4 + +# qhasm: h0 = h13 +# asm 1: movdqa h0=reg128#12 +# asm 2: movdqa h0=%xmm11 +movdqa %xmm4,%xmm11 + +# qhasm: h1 = h1 ^ h13 +# asm 1: vpxor h1=reg128#11 +# asm 2: vpxor h1=%xmm10 +vpxor %xmm4,%xmm10,%xmm10 + +# qhasm: h3 = h3 ^ h13 +# asm 1: vpxor h3=reg128#14 +# asm 2: vpxor h3=%xmm13 +vpxor %xmm4,%xmm13,%xmm13 + +# qhasm: h4 = h4 ^ h13 +# asm 1: vpxor h4=reg128#5 +# asm 2: vpxor h4=%xmm4 +vpxor %xmm4,%xmm12,%xmm4 + +# qhasm: h12 = h12 ^ mem128[ ptr + 384 ] +# asm 1: vpxor 384(h12=reg128#3 +# asm 2: vpxor 384(h12=%xmm2 +vpxor 384(%r8),%xmm2,%xmm2 + +# qhasm: h12 = h12 ^ mem128[ ptr + 176 ] +# asm 1: vpxor 176(h12=reg128#3 +# asm 2: vpxor 176(h12=%xmm2 +vpxor 176(%r8),%xmm2,%xmm2 + +# qhasm: mem128[ input_0 + 192 ] = h12 +# asm 1: movdqu h11=reg128#2 +# asm 2: vpxor 352(h11=%xmm1 +vpxor 352(%r8),%xmm1,%xmm1 + +# qhasm: h11 = h11 ^ mem128[ ptr + 144 ] +# asm 1: vpxor 144(h11=reg128#2 +# asm 2: vpxor 144(h11=%xmm1 +vpxor 144(%r8),%xmm1,%xmm1 + +# qhasm: mem128[ input_0 + 176 ] = h11 +# asm 1: movdqu h10=reg128#2 +# asm 2: vpxor 320(h10=%xmm1 +vpxor 320(%r8),%xmm5,%xmm1 + +# qhasm: h10 = h10 ^ mem128[ ptr + 112 ] +# asm 1: vpxor 112(h10=reg128#2 +# asm 2: vpxor 112(h10=%xmm1 +vpxor 112(%r8),%xmm1,%xmm1 + +# qhasm: mem128[ input_0 + 160 ] = h10 +# asm 1: movdqu h9=reg128#2 +# asm 2: vpxor 288(h9=%xmm1 +vpxor 288(%r8),%xmm6,%xmm1 + +# qhasm: h9 = h9 ^ mem128[ ptr + 80 ] +# asm 1: vpxor 80(h9=reg128#2 +# asm 2: vpxor 80(h9=%xmm1 +vpxor 80(%r8),%xmm1,%xmm1 + +# qhasm: mem128[ input_0 + 144 ] = h9 +# asm 1: movdqu h8=reg128#2 +# asm 2: vpxor 256(h8=%xmm1 +vpxor 256(%r8),%xmm7,%xmm1 + +# qhasm: h8 = h8 ^ mem128[ ptr + 48 ] +# asm 1: vpxor 48(h8=reg128#2 +# asm 2: vpxor 48(h8=%xmm1 +vpxor 48(%r8),%xmm1,%xmm1 + +# qhasm: mem128[ input_0 + 128 ] = h8 +# asm 1: movdqu h7=reg128#2 +# asm 2: vpxor 224(h7=%xmm1 +vpxor 224(%r8),%xmm8,%xmm1 + +# qhasm: h7 = h7 ^ mem128[ ptr + 16 ] +# asm 1: vpxor 16(h7=reg128#2 +# asm 2: vpxor 16(h7=%xmm1 +vpxor 16(%r8),%xmm1,%xmm1 + +# qhasm: mem128[ input_0 + 112 ] = h7 +# asm 1: movdqu h6=reg128#1 +# asm 2: vpxor 192(h6=%xmm0 +vpxor 192(%r8),%xmm0,%xmm0 + +# qhasm: mem128[ input_0 + 96 ] = h6 +# asm 
1: movdqu h5=reg128#1 +# asm 2: vpxor 160(h5=%xmm0 +vpxor 160(%r8),%xmm3,%xmm0 + +# qhasm: mem128[ input_0 + 80 ] = h5 +# asm 1: movdqu h4=reg128#1 +# asm 2: vpxor 128(h4=%xmm0 +vpxor 128(%r8),%xmm4,%xmm0 + +# qhasm: mem128[ input_0 + 64 ] = h4 +# asm 1: movdqu h3=reg128#1 +# asm 2: vpxor 96(h3=%xmm0 +vpxor 96(%r8),%xmm13,%xmm0 + +# qhasm: mem128[ input_0 + 48 ] = h3 +# asm 1: movdqu h2=reg128#1 +# asm 2: vpxor 64(h2=%xmm0 +vpxor 64(%r8),%xmm9,%xmm0 + +# qhasm: mem128[ input_0 + 32 ] = h2 +# asm 1: movdqu h1=reg128#1 +# asm 2: vpxor 32(h1=%xmm0 +vpxor 32(%r8),%xmm10,%xmm0 + +# qhasm: mem128[ input_0 + 16 ] = h1 +# asm 1: movdqu h0=reg128#1 +# asm 2: vpxor 0(h0=%xmm0 +vpxor 0(%r8),%xmm11,%xmm0 + +# qhasm: mem128[ input_0 + 0 ] = h0 +# asm 1: movdqu + +typedef __m256i vec256; + +vec256 PQCLEAN_MCELIECE8192128_AVX_vec256_set1_16b(uint16_t a); +vec256 PQCLEAN_MCELIECE8192128_AVX_vec256_setzero(void); +vec256 PQCLEAN_MCELIECE8192128_AVX_vec256_set4x(uint64_t a0, uint64_t a1, uint64_t a2, uint64_t a3); + +// Extract requires a literal argument so need to be macros +#define PQCLEAN_MCELIECE8192128_AVX_vec256_extract2x(a,i) ((vec128) _mm256_extractf128_si256((vec256) (a),(i))) +#define PQCLEAN_MCELIECE8192128_AVX_vec256_extract(a,i) ((uint64_t) _mm256_extract_epi64((vec256) (a),(i))) + +int PQCLEAN_MCELIECE8192128_AVX_vec256_testz(vec256 a); +vec256 PQCLEAN_MCELIECE8192128_AVX_vec256_and(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE8192128_AVX_vec256_xor(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE8192128_AVX_vec256_or(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE8192128_AVX_vec256_sll_4x(vec256 a, int s); +vec256 PQCLEAN_MCELIECE8192128_AVX_vec256_srl_4x(vec256 a, int s); +vec256 PQCLEAN_MCELIECE8192128_AVX_vec256_unpack_low(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE8192128_AVX_vec256_unpack_high(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE8192128_AVX_vec256_unpack_low_2x(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE8192128_AVX_vec256_unpack_high_2x(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE8192128_AVX_vec256_or_reduce(const vec256 *a); +void PQCLEAN_MCELIECE8192128_AVX_vec256_copy(vec256 *dest, const vec256 *src); + +/* bitsliced field multiplications */ +void PQCLEAN_MCELIECE8192128_AVX_vec256_mul(vec256 *h, vec256 *f, const vec256 *g); +void PQCLEAN_MCELIECE8192128_AVX_vec256_sq(vec256 * /*out*/, const vec256 * /*in*/); +void PQCLEAN_MCELIECE8192128_AVX_vec256_inv(vec256 * /*out*/, const vec256 * /*in*/); + +extern void PQCLEAN_MCELIECE8192128_AVX_vec256_maa_asm(vec256 *, vec256 *, const vec256 *); +extern void PQCLEAN_MCELIECE8192128_AVX_vec256_ama_asm(vec256 *, vec256 *, const vec256 *); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/vec256_ama_asm.S b/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/vec256_ama_asm.S new file mode 100644 index 0000000000..3a6f75879a --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece8192128_avx/vec256_ama_asm.S @@ -0,0 +1,2322 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: reg256 a0 + +# qhasm: reg256 a1 + +# qhasm: reg256 a2 + +# qhasm: reg256 a3 + +# qhasm: reg256 a4 + +# qhasm: reg256 a5 + +# qhasm: reg256 a6 + +# qhasm: reg256 
a7 + +# qhasm: reg256 a8 + +# qhasm: reg256 a9 + +# qhasm: reg256 a10 + +# qhasm: reg256 a11 + +# qhasm: reg256 a12 + +# qhasm: reg256 b0 + +# qhasm: reg256 b1 + +# qhasm: reg256 r0 + +# qhasm: reg256 r1 + +# qhasm: reg256 r2 + +# qhasm: reg256 r3 + +# qhasm: reg256 r4 + +# qhasm: reg256 r5 + +# qhasm: reg256 r6 + +# qhasm: reg256 r7 + +# qhasm: reg256 r8 + +# qhasm: reg256 r9 + +# qhasm: reg256 r10 + +# qhasm: reg256 r11 + +# qhasm: reg256 r12 + +# qhasm: reg256 r13 + +# qhasm: reg256 r14 + +# qhasm: reg256 r15 + +# qhasm: reg256 r16 + +# qhasm: reg256 r17 + +# qhasm: reg256 r18 + +# qhasm: reg256 r19 + +# qhasm: reg256 r20 + +# qhasm: reg256 r21 + +# qhasm: reg256 r22 + +# qhasm: reg256 r23 + +# qhasm: reg256 r24 + +# qhasm: reg256 r + +# qhasm: enter vec256_ama_asm +.p2align 5 +.global _PQCLEAN_MCELIECE8192128_AVX_vec256_ama_asm +.global PQCLEAN_MCELIECE8192128_AVX_vec256_ama_asm +_PQCLEAN_MCELIECE8192128_AVX_vec256_ama_asm: +PQCLEAN_MCELIECE8192128_AVX_vec256_ama_asm: +mov %rsp,%r11 +and $31,%r11 +add $0,%r11 +sub %r11,%rsp + +# qhasm: b0 = mem256[ input_2 + 0 ] +# asm 1: vmovupd 0(b0=reg256#1 +# asm 2: vmovupd 0(b0=%ymm0 +vmovupd 0(%rdx),%ymm0 + +# qhasm: a12 = mem256[ input_0 + 384 ] +# asm 1: vmovupd 384(a12=reg256#2 +# asm 2: vmovupd 384(a12=%ymm1 +vmovupd 384(%rdi),%ymm1 + +# qhasm: a12 = a12 ^ mem256[ input_1 + 384 ] +# asm 1: vpxor 384(a12=reg256#2 +# asm 2: vpxor 384(a12=%ymm1 +vpxor 384(%rsi),%ymm1,%ymm1 + +# qhasm: mem256[ input_0 + 384 ] = a12 +# asm 1: vmovupd r12=reg256#3 +# asm 2: vpand r12=%ymm2 +vpand %ymm1,%ymm0,%ymm2 + +# qhasm: r13 = a12 & mem256[input_2 + 32] +# asm 1: vpand 32(r13=reg256#4 +# asm 2: vpand 32(r13=%ymm3 +vpand 32(%rdx),%ymm1,%ymm3 + +# qhasm: r14 = a12 & mem256[input_2 + 64] +# asm 1: vpand 64(r14=reg256#5 +# asm 2: vpand 64(r14=%ymm4 +vpand 64(%rdx),%ymm1,%ymm4 + +# qhasm: r15 = a12 & mem256[input_2 + 96] +# asm 1: vpand 96(r15=reg256#6 +# asm 2: vpand 96(r15=%ymm5 +vpand 96(%rdx),%ymm1,%ymm5 + +# qhasm: r16 = a12 & mem256[input_2 + 128] +# asm 1: vpand 128(r16=reg256#7 +# asm 2: vpand 128(r16=%ymm6 +vpand 128(%rdx),%ymm1,%ymm6 + +# qhasm: r17 = a12 & mem256[input_2 + 160] +# asm 1: vpand 160(r17=reg256#8 +# asm 2: vpand 160(r17=%ymm7 +vpand 160(%rdx),%ymm1,%ymm7 + +# qhasm: r18 = a12 & mem256[input_2 + 192] +# asm 1: vpand 192(r18=reg256#9 +# asm 2: vpand 192(r18=%ymm8 +vpand 192(%rdx),%ymm1,%ymm8 + +# qhasm: r19 = a12 & mem256[input_2 + 224] +# asm 1: vpand 224(r19=reg256#10 +# asm 2: vpand 224(r19=%ymm9 +vpand 224(%rdx),%ymm1,%ymm9 + +# qhasm: r20 = a12 & mem256[input_2 + 256] +# asm 1: vpand 256(r20=reg256#11 +# asm 2: vpand 256(r20=%ymm10 +vpand 256(%rdx),%ymm1,%ymm10 + +# qhasm: r21 = a12 & mem256[input_2 + 288] +# asm 1: vpand 288(r21=reg256#12 +# asm 2: vpand 288(r21=%ymm11 +vpand 288(%rdx),%ymm1,%ymm11 + +# qhasm: r22 = a12 & mem256[input_2 + 320] +# asm 1: vpand 320(r22=reg256#13 +# asm 2: vpand 320(r22=%ymm12 +vpand 320(%rdx),%ymm1,%ymm12 + +# qhasm: r23 = a12 & mem256[input_2 + 352] +# asm 1: vpand 352(r23=reg256#14 +# asm 2: vpand 352(r23=%ymm13 +vpand 352(%rdx),%ymm1,%ymm13 + +# qhasm: r24 = a12 & mem256[input_2 + 384] +# asm 1: vpand 384(r24=reg256#2 +# asm 2: vpand 384(r24=%ymm1 +vpand 384(%rdx),%ymm1,%ymm1 + +# qhasm: r15 ^= r24 +# asm 1: vpxor r11=reg256#2 +# asm 2: vmovapd r11=%ymm1 +vmovapd %ymm1,%ymm1 + +# qhasm: a11 = mem256[ input_0 + 352 ] +# asm 1: vmovupd 352(a11=reg256#15 +# asm 2: vmovupd 352(a11=%ymm14 +vmovupd 352(%rdi),%ymm14 + +# qhasm: a11 = a11 ^ mem256[ input_1 + 352 ] +# asm 1: vpxor 352(a11=reg256#15 +# asm 2: 
vpxor 352(a11=%ymm14 +vpxor 352(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 352 ] = a11 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r21 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r22 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r23 ^= r +# asm 1: vpxor r10=reg256#14 +# asm 2: vmovapd r10=%ymm13 +vmovapd %ymm13,%ymm13 + +# qhasm: a10 = mem256[ input_0 + 320 ] +# asm 1: vmovupd 320(a10=reg256#15 +# asm 2: vmovupd 320(a10=%ymm14 +vmovupd 320(%rdi),%ymm14 + +# qhasm: a10 = a10 ^ mem256[ input_1 + 320 ] +# asm 1: vpxor 320(a10=reg256#15 +# asm 2: vpxor 320(a10=%ymm14 +vpxor 320(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 320 ] = a10 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r21 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r22 ^= r +# asm 1: vpxor r9=reg256#13 +# asm 2: vmovapd r9=%ymm12 +vmovapd %ymm12,%ymm12 + +# qhasm: a9 = mem256[ input_0 + 288 ] +# asm 1: vmovupd 288(a9=reg256#15 +# asm 2: vmovupd 288(a9=%ymm14 +vmovupd 288(%rdi),%ymm14 + +# qhasm: a9 = a9 ^ mem256[ 
input_1 + 288 ] +# asm 1: vpxor 288(a9=reg256#15 +# asm 2: vpxor 288(a9=%ymm14 +vpxor 288(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 288 ] = a9 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r21 ^= r +# asm 1: vpxor r8=reg256#12 +# asm 2: vmovapd r8=%ymm11 +vmovapd %ymm11,%ymm11 + +# qhasm: a8 = mem256[ input_0 + 256 ] +# asm 1: vmovupd 256(a8=reg256#15 +# asm 2: vmovupd 256(a8=%ymm14 +vmovupd 256(%rdi),%ymm14 + +# qhasm: a8 = a8 ^ mem256[ input_1 + 256 ] +# asm 1: vpxor 256(a8=reg256#15 +# asm 2: vpxor 256(a8=%ymm14 +vpxor 256(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 256 ] = a8 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r20 ^= r +# asm 1: vpxor r7=reg256#11 +# asm 2: vmovapd r7=%ymm10 +vmovapd %ymm10,%ymm10 + +# qhasm: a7 = mem256[ input_0 + 224 ] +# asm 1: vmovupd 224(a7=reg256#15 +# asm 2: vmovupd 224(a7=%ymm14 +vmovupd 
224(%rdi),%ymm14 + +# qhasm: a7 = a7 ^ mem256[ input_1 + 224 ] +# asm 1: vpxor 224(a7=reg256#15 +# asm 2: vpxor 224(a7=%ymm14 +vpxor 224(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 224 ] = a7 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r19 ^= r +# asm 1: vpxor r6=reg256#10 +# asm 2: vmovapd r6=%ymm9 +vmovapd %ymm9,%ymm9 + +# qhasm: a6 = mem256[ input_0 + 192 ] +# asm 1: vmovupd 192(a6=reg256#15 +# asm 2: vmovupd 192(a6=%ymm14 +vmovupd 192(%rdi),%ymm14 + +# qhasm: a6 = a6 ^ mem256[ input_1 + 192 ] +# asm 1: vpxor 192(a6=reg256#15 +# asm 2: vpxor 192(a6=%ymm14 +vpxor 192(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 192 ] = a6 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r18 ^= r +# asm 1: vpxor r5=reg256#9 +# asm 2: vmovapd r5=%ymm8 +vmovapd %ymm8,%ymm8 + +# qhasm: a5 = mem256[ input_0 + 160 ] +# asm 1: vmovupd 160(a5=reg256#15 +# asm 2: 
vmovupd 160(a5=%ymm14 +vmovupd 160(%rdi),%ymm14 + +# qhasm: a5 = a5 ^ mem256[ input_1 + 160 ] +# asm 1: vpxor 160(a5=reg256#15 +# asm 2: vpxor 160(a5=%ymm14 +vpxor 160(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 160 ] = a5 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r17 ^= r +# asm 1: vpxor r4=reg256#8 +# asm 2: vmovapd r4=%ymm7 +vmovapd %ymm7,%ymm7 + +# qhasm: a4 = mem256[ input_0 + 128 ] +# asm 1: vmovupd 128(a4=reg256#15 +# asm 2: vmovupd 128(a4=%ymm14 +vmovupd 128(%rdi),%ymm14 + +# qhasm: a4 = a4 ^ mem256[ input_1 + 128 ] +# asm 1: vpxor 128(a4=reg256#15 +# asm 2: vpxor 128(a4=%ymm14 +vpxor 128(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 128 ] = a4 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r16 ^= r +# asm 1: vpxor r3=reg256#7 +# asm 2: vmovapd r3=%ymm6 +vmovapd %ymm6,%ymm6 + +# qhasm: a3 = mem256[ input_0 + 96 ] +# asm 1: vmovupd 
96(a3=reg256#15 +# asm 2: vmovupd 96(a3=%ymm14 +vmovupd 96(%rdi),%ymm14 + +# qhasm: a3 = a3 ^ mem256[ input_1 + 96 ] +# asm 1: vpxor 96(a3=reg256#15 +# asm 2: vpxor 96(a3=%ymm14 +vpxor 96(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 96 ] = a3 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r15 ^= r +# asm 1: vpxor r2=reg256#6 +# asm 2: vmovapd r2=%ymm5 +vmovapd %ymm5,%ymm5 + +# qhasm: a2 = mem256[ input_0 + 64 ] +# asm 1: vmovupd 64(a2=reg256#15 +# asm 2: vmovupd 64(a2=%ymm14 +vmovupd 64(%rdi),%ymm14 + +# qhasm: a2 = a2 ^ mem256[ input_1 + 64 ] +# asm 1: vpxor 64(a2=reg256#15 +# asm 2: vpxor 64(a2=%ymm14 +vpxor 64(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 64 ] = a2 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r14 ^= r +# asm 1: vpxor r1=reg256#5 +# asm 2: vmovapd r1=%ymm4 +vmovapd %ymm4,%ymm4 + +# qhasm: a1 = mem256[ input_0 + 32 ] +# asm 1: 
vmovupd 32(a1=reg256#15 +# asm 2: vmovupd 32(a1=%ymm14 +vmovupd 32(%rdi),%ymm14 + +# qhasm: a1 = a1 ^ mem256[ input_1 + 32 ] +# asm 1: vpxor 32(a1=reg256#15 +# asm 2: vpxor 32(a1=%ymm14 +vpxor 32(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 32 ] = a1 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r1 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r13 ^= r +# asm 1: vpxor r0=reg256#4 +# asm 2: vmovapd r0=%ymm3 +vmovapd %ymm3,%ymm3 + +# qhasm: a0 = mem256[ input_0 + 0 ] +# asm 1: vmovupd 0(a0=reg256#15 +# asm 2: vmovupd 0(a0=%ymm14 +vmovupd 0(%rdi),%ymm14 + +# qhasm: a0 = a0 ^ mem256[ input_1 + 0 ] +# asm 1: vpxor 0(a0=reg256#15 +# asm 2: vpxor 0(a0=%ymm14 +vpxor 0(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 0 ] = a0 +# asm 1: vmovupd r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm14,%ymm0,%ymm0 + +# qhasm: r0 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 32(r=%ymm0 +vpand 32(%rdx),%ymm14,%ymm0 + +# qhasm: r1 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 64(r=%ymm0 +vpand 64(%rdx),%ymm14,%ymm0 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 96(r=%ymm0 +vpand 96(%rdx),%ymm14,%ymm0 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 128(r=%ymm0 +vpand 128(%rdx),%ymm14,%ymm0 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 160(r=%ymm0 +vpand 160(%rdx),%ymm14,%ymm0 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 192(r=%ymm0 +vpand 192(%rdx),%ymm14,%ymm0 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 224(r=%ymm0 +vpand 224(%rdx),%ymm14,%ymm0 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 256(r=%ymm0 +vpand 256(%rdx),%ymm14,%ymm0 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 288(r=%ymm0 +vpand 288(%rdx),%ymm14,%ymm0 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 320(r=%ymm0 +vpand 320(%rdx),%ymm14,%ymm0 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 352(r=%ymm0 +vpand 352(%rdx),%ymm14,%ymm0 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 384(r=%ymm0 +vpand 384(%rdx),%ymm14,%ymm0 + +# qhasm: r12 ^= r +# asm 1: vpxor r12=reg256#1 +# asm 2: vpxor 384(r12=%ymm0 +vpxor 384(%rsi),%ymm2,%ymm0 + +# qhasm: mem256[ input_1 + 384 ] = r12 +# asm 1: vmovupd r11=reg256#1 +# asm 2: 
vpxor 352(r11=%ymm0 +vpxor 352(%rsi),%ymm1,%ymm0 + +# qhasm: mem256[ input_1 + 352 ] = r11 +# asm 1: vmovupd r10=reg256#1 +# asm 2: vpxor 320(r10=%ymm0 +vpxor 320(%rsi),%ymm13,%ymm0 + +# qhasm: mem256[ input_1 + 320 ] = r10 +# asm 1: vmovupd r9=reg256#1 +# asm 2: vpxor 288(r9=%ymm0 +vpxor 288(%rsi),%ymm12,%ymm0 + +# qhasm: mem256[ input_1 + 288 ] = r9 +# asm 1: vmovupd r8=reg256#1 +# asm 2: vpxor 256(r8=%ymm0 +vpxor 256(%rsi),%ymm11,%ymm0 + +# qhasm: mem256[ input_1 + 256 ] = r8 +# asm 1: vmovupd r7=reg256#1 +# asm 2: vpxor 224(r7=%ymm0 +vpxor 224(%rsi),%ymm10,%ymm0 + +# qhasm: mem256[ input_1 + 224 ] = r7 +# asm 1: vmovupd r6=reg256#1 +# asm 2: vpxor 192(r6=%ymm0 +vpxor 192(%rsi),%ymm9,%ymm0 + +# qhasm: mem256[ input_1 + 192 ] = r6 +# asm 1: vmovupd r5=reg256#1 +# asm 2: vpxor 160(r5=%ymm0 +vpxor 160(%rsi),%ymm8,%ymm0 + +# qhasm: mem256[ input_1 + 160 ] = r5 +# asm 1: vmovupd r4=reg256#1 +# asm 2: vpxor 128(r4=%ymm0 +vpxor 128(%rsi),%ymm7,%ymm0 + +# qhasm: mem256[ input_1 + 128 ] = r4 +# asm 1: vmovupd r3=reg256#1 +# asm 2: vpxor 96(r3=%ymm0 +vpxor 96(%rsi),%ymm6,%ymm0 + +# qhasm: mem256[ input_1 + 96 ] = r3 +# asm 1: vmovupd r2=reg256#1 +# asm 2: vpxor 64(r2=%ymm0 +vpxor 64(%rsi),%ymm5,%ymm0 + +# qhasm: mem256[ input_1 + 64 ] = r2 +# asm 1: vmovupd r1=reg256#1 +# asm 2: vpxor 32(r1=%ymm0 +vpxor 32(%rsi),%ymm4,%ymm0 + +# qhasm: mem256[ input_1 + 32 ] = r1 +# asm 1: vmovupd r0=reg256#1 +# asm 2: vpxor 0(r0=%ymm0 +vpxor 0(%rsi),%ymm3,%ymm0 + +# qhasm: mem256[ input_1 + 0 ] = r0 +# asm 1: vmovupd b0=reg256#1 +# asm 2: vmovupd 0(b0=%ymm0 +vmovupd 0(%rdx),%ymm0 + +# qhasm: a12 = mem256[ input_1 + 384 ] +# asm 1: vmovupd 384(a12=reg256#2 +# asm 2: vmovupd 384(a12=%ymm1 +vmovupd 384(%rsi),%ymm1 + +# qhasm: r12 = a12 & b0 +# asm 1: vpand r12=reg256#3 +# asm 2: vpand r12=%ymm2 +vpand %ymm1,%ymm0,%ymm2 + +# qhasm: r13 = a12 & mem256[input_2 + 32] +# asm 1: vpand 32(r13=reg256#4 +# asm 2: vpand 32(r13=%ymm3 +vpand 32(%rdx),%ymm1,%ymm3 + +# qhasm: r14 = a12 & mem256[input_2 + 64] +# asm 1: vpand 64(r14=reg256#5 +# asm 2: vpand 64(r14=%ymm4 +vpand 64(%rdx),%ymm1,%ymm4 + +# qhasm: r15 = a12 & mem256[input_2 + 96] +# asm 1: vpand 96(r15=reg256#6 +# asm 2: vpand 96(r15=%ymm5 +vpand 96(%rdx),%ymm1,%ymm5 + +# qhasm: r16 = a12 & mem256[input_2 + 128] +# asm 1: vpand 128(r16=reg256#7 +# asm 2: vpand 128(r16=%ymm6 +vpand 128(%rdx),%ymm1,%ymm6 + +# qhasm: r17 = a12 & mem256[input_2 + 160] +# asm 1: vpand 160(r17=reg256#8 +# asm 2: vpand 160(r17=%ymm7 +vpand 160(%rdx),%ymm1,%ymm7 + +# qhasm: r18 = a12 & mem256[input_2 + 192] +# asm 1: vpand 192(r18=reg256#9 +# asm 2: vpand 192(r18=%ymm8 +vpand 192(%rdx),%ymm1,%ymm8 + +# qhasm: r19 = a12 & mem256[input_2 + 224] +# asm 1: vpand 224(r19=reg256#10 +# asm 2: vpand 224(r19=%ymm9 +vpand 224(%rdx),%ymm1,%ymm9 + +# qhasm: r20 = a12 & mem256[input_2 + 256] +# asm 1: vpand 256(r20=reg256#11 +# asm 2: vpand 256(r20=%ymm10 +vpand 256(%rdx),%ymm1,%ymm10 + +# qhasm: r21 = a12 & mem256[input_2 + 288] +# asm 1: vpand 288(r21=reg256#12 +# asm 2: vpand 288(r21=%ymm11 +vpand 288(%rdx),%ymm1,%ymm11 + +# qhasm: r22 = a12 & mem256[input_2 + 320] +# asm 1: vpand 320(r22=reg256#13 +# asm 2: vpand 320(r22=%ymm12 +vpand 320(%rdx),%ymm1,%ymm12 + +# qhasm: r23 = a12 & mem256[input_2 + 352] +# asm 1: vpand 352(r23=reg256#14 +# asm 2: vpand 352(r23=%ymm13 +vpand 352(%rdx),%ymm1,%ymm13 + +# qhasm: r24 = a12 & mem256[input_2 + 384] +# asm 1: vpand 384(r24=reg256#2 +# asm 2: vpand 384(r24=%ymm1 +vpand 384(%rdx),%ymm1,%ymm1 + +# qhasm: r15 ^= r24 +# asm 1: vpxor r11=reg256#2 +# asm 2: vmovapd 
r11=%ymm1 +vmovapd %ymm1,%ymm1 + +# qhasm: a11 = mem256[ input_1 + 352 ] +# asm 1: vmovupd 352(a11=reg256#15 +# asm 2: vmovupd 352(a11=%ymm14 +vmovupd 352(%rsi),%ymm14 + +# qhasm: r = a11 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r21 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r22 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r23 ^= r +# asm 1: vpxor r10=reg256#14 +# asm 2: vmovapd r10=%ymm13 +vmovapd %ymm13,%ymm13 + +# qhasm: a10 = mem256[ input_1 + 320 ] +# asm 1: vmovupd 320(a10=reg256#15 +# asm 2: vmovupd 320(a10=%ymm14 +vmovupd 320(%rsi),%ymm14 + +# qhasm: r = a10 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r21 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r22 ^= r +# asm 1: vpxor r9=reg256#13 +# asm 2: vmovapd r9=%ymm12 +vmovapd %ymm12,%ymm12 + +# qhasm: a9 = mem256[ input_1 + 288 ] +# asm 1: vmovupd 288(a9=reg256#15 +# asm 2: vmovupd 288(a9=%ymm14 +vmovupd 288(%rsi),%ymm14 + +# qhasm: r = a9 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand 
%ymm14,%ymm0,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r21 ^= r +# asm 1: vpxor r8=reg256#12 +# asm 2: vmovapd r8=%ymm11 +vmovapd %ymm11,%ymm11 + +# qhasm: a8 = mem256[ input_1 + 256 ] +# asm 1: vmovupd 256(a8=reg256#15 +# asm 2: vmovupd 256(a8=%ymm14 +vmovupd 256(%rsi),%ymm14 + +# qhasm: r = a8 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r20 ^= r +# asm 1: vpxor r7=reg256#11 +# asm 2: vmovapd r7=%ymm10 +vmovapd %ymm10,%ymm10 + +# qhasm: a7 = mem256[ input_1 + 224 ] +# asm 1: vmovupd 224(a7=reg256#15 +# asm 2: vmovupd 224(a7=%ymm14 +vmovupd 224(%rsi),%ymm14 + +# qhasm: r = a7 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor 
r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r19 ^= r +# asm 1: vpxor r6=reg256#10 +# asm 2: vmovapd r6=%ymm9 +vmovapd %ymm9,%ymm9 + +# qhasm: a6 = mem256[ input_1 + 192 ] +# asm 1: vmovupd 192(a6=reg256#15 +# asm 2: vmovupd 192(a6=%ymm14 +vmovupd 192(%rsi),%ymm14 + +# qhasm: r = a6 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r18 ^= r +# asm 1: vpxor r5=reg256#9 +# asm 2: vmovapd r5=%ymm8 +vmovapd %ymm8,%ymm8 + +# qhasm: a5 = mem256[ input_1 + 160 ] +# asm 1: vmovupd 160(a5=reg256#15 +# asm 2: vmovupd 160(a5=%ymm14 +vmovupd 160(%rsi),%ymm14 + +# qhasm: r = a5 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 
+ +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r17 ^= r +# asm 1: vpxor r4=reg256#8 +# asm 2: vmovapd r4=%ymm7 +vmovapd %ymm7,%ymm7 + +# qhasm: a4 = mem256[ input_1 + 128 ] +# asm 1: vmovupd 128(a4=reg256#15 +# asm 2: vmovupd 128(a4=%ymm14 +vmovupd 128(%rsi),%ymm14 + +# qhasm: r = a4 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r16 ^= r +# asm 1: vpxor r3=reg256#7 +# asm 2: vmovapd r3=%ymm6 +vmovapd %ymm6,%ymm6 + +# qhasm: a3 = mem256[ input_1 + 96 ] +# asm 1: vmovupd 96(a3=reg256#15 +# asm 2: vmovupd 96(a3=%ymm14 +vmovupd 96(%rsi),%ymm14 + +# qhasm: r = a3 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 
+vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r15 ^= r +# asm 1: vpxor r2=reg256#6 +# asm 2: vmovapd r2=%ymm5 +vmovapd %ymm5,%ymm5 + +# qhasm: a2 = mem256[ input_1 + 64 ] +# asm 1: vmovupd 64(a2=reg256#15 +# asm 2: vmovupd 64(a2=%ymm14 +vmovupd 64(%rsi),%ymm14 + +# qhasm: r = a2 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r14 ^= r +# asm 1: vpxor r1=reg256#5 +# asm 2: vmovapd r1=%ymm4 +vmovapd %ymm4,%ymm4 + +# qhasm: a1 = mem256[ input_1 + 32 ] +# asm 1: vmovupd 32(a1=reg256#15 +# asm 2: vmovupd 32(a1=%ymm14 +vmovupd 32(%rsi),%ymm14 + +# qhasm: r = a1 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r1 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 
2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r13 ^= r +# asm 1: vpxor r0=reg256#4 +# asm 2: vmovapd r0=%ymm3 +vmovapd %ymm3,%ymm3 + +# qhasm: a0 = mem256[ input_1 + 0 ] +# asm 1: vmovupd 0(a0=reg256#15 +# asm 2: vmovupd 0(a0=%ymm14 +vmovupd 0(%rsi),%ymm14 + +# qhasm: r = a0 & b0 +# asm 1: vpand r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm14,%ymm0,%ymm0 + +# qhasm: r0 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 32(r=%ymm0 +vpand 32(%rdx),%ymm14,%ymm0 + +# qhasm: r1 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 64(r=%ymm0 +vpand 64(%rdx),%ymm14,%ymm0 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 96(r=%ymm0 +vpand 96(%rdx),%ymm14,%ymm0 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 128(r=%ymm0 +vpand 128(%rdx),%ymm14,%ymm0 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 160(r=%ymm0 +vpand 160(%rdx),%ymm14,%ymm0 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 192(r=%ymm0 +vpand 192(%rdx),%ymm14,%ymm0 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 224(r=%ymm0 +vpand 224(%rdx),%ymm14,%ymm0 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 256(r=%ymm0 +vpand 256(%rdx),%ymm14,%ymm0 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 288(r=%ymm0 +vpand 288(%rdx),%ymm14,%ymm0 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 320(r=%ymm0 +vpand 320(%rdx),%ymm14,%ymm0 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 352(r=%ymm0 +vpand 352(%rdx),%ymm14,%ymm0 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 384(r=%ymm0 +vpand 384(%rdx),%ymm14,%ymm0 + +# qhasm: r12 ^= r +# asm 1: vpxor r12=reg256#1 +# asm 2: vpxor 384(r12=%ymm0 +vpxor 384(%rdi),%ymm2,%ymm0 + +# qhasm: mem256[ input_0 + 384 ] = r12 +# asm 1: vmovupd r12=reg256#1 +# asm 2: vpxor 384(r12=%ymm0 +vpxor 384(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 384 ] = r12 +# asm 1: vmovupd r11=reg256#1 +# asm 2: vpxor 352(r11=%ymm0 +vpxor 352(%rdi),%ymm1,%ymm0 + +# qhasm: mem256[ input_0 + 352 ] = r11 +# asm 1: vmovupd r11=reg256#1 +# asm 2: vpxor 352(r11=%ymm0 +vpxor 352(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 352 ] = r11 +# asm 1: vmovupd r10=reg256#1 +# asm 2: vpxor 320(r10=%ymm0 +vpxor 320(%rdi),%ymm13,%ymm0 + +# qhasm: mem256[ input_0 + 320 ] = r10 +# asm 1: vmovupd r10=reg256#1 +# asm 2: vpxor 320(r10=%ymm0 +vpxor 320(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 320 ] = r10 +# asm 1: vmovupd r9=reg256#1 +# asm 2: vpxor 288(r9=%ymm0 +vpxor 288(%rdi),%ymm12,%ymm0 + +# qhasm: mem256[ input_0 + 288 ] = r9 +# asm 1: vmovupd r9=reg256#1 +# asm 2: vpxor 288(r9=%ymm0 +vpxor 288(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 288 ] = r9 +# asm 1: vmovupd r8=reg256#1 +# asm 2: vpxor 256(r8=%ymm0 +vpxor 256(%rdi),%ymm11,%ymm0 + +# qhasm: mem256[ input_0 + 256 ] = r8 +# asm 1: vmovupd r8=reg256#1 +# asm 2: vpxor 256(r8=%ymm0 +vpxor 256(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 256 ] = r8 +# asm 1: vmovupd r7=reg256#1 +# asm 2: vpxor 224(r7=%ymm0 +vpxor 224(%rdi),%ymm10,%ymm0 + +# qhasm: mem256[ input_0 + 224 ] = r7 +# asm 1: vmovupd r7=reg256#1 +# asm 2: vpxor 224(r7=%ymm0 +vpxor 224(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 224 ] = r7 +# asm 1: vmovupd r6=reg256#1 +# asm 2: vpxor 192(r6=%ymm0 +vpxor 192(%rdi),%ymm9,%ymm0 + +# qhasm: mem256[ input_0 + 192 ] = r6 +# asm 1: vmovupd r6=reg256#1 +# asm 2: vpxor 192(r6=%ymm0 +vpxor 192(%rsi),%ymm0,%ymm0 + +# 
qhasm: mem256[ input_1 + 192 ] = r6 +# asm 1: vmovupd r5=reg256#1 +# asm 2: vpxor 160(r5=%ymm0 +vpxor 160(%rdi),%ymm8,%ymm0 + +# qhasm: mem256[ input_0 + 160 ] = r5 +# asm 1: vmovupd r5=reg256#1 +# asm 2: vpxor 160(r5=%ymm0 +vpxor 160(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 160 ] = r5 +# asm 1: vmovupd r4=reg256#1 +# asm 2: vpxor 128(r4=%ymm0 +vpxor 128(%rdi),%ymm7,%ymm0 + +# qhasm: mem256[ input_0 + 128 ] = r4 +# asm 1: vmovupd r4=reg256#1 +# asm 2: vpxor 128(r4=%ymm0 +vpxor 128(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 128 ] = r4 +# asm 1: vmovupd r3=reg256#1 +# asm 2: vpxor 96(r3=%ymm0 +vpxor 96(%rdi),%ymm6,%ymm0 + +# qhasm: mem256[ input_0 + 96 ] = r3 +# asm 1: vmovupd r3=reg256#1 +# asm 2: vpxor 96(r3=%ymm0 +vpxor 96(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 96 ] = r3 +# asm 1: vmovupd r2=reg256#1 +# asm 2: vpxor 64(r2=%ymm0 +vpxor 64(%rdi),%ymm5,%ymm0 + +# qhasm: mem256[ input_0 + 64 ] = r2 +# asm 1: vmovupd r2=reg256#1 +# asm 2: vpxor 64(r2=%ymm0 +vpxor 64(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 64 ] = r2 +# asm 1: vmovupd r1=reg256#1 +# asm 2: vpxor 32(r1=%ymm0 +vpxor 32(%rdi),%ymm4,%ymm0 + +# qhasm: mem256[ input_0 + 32 ] = r1 +# asm 1: vmovupd r1=reg256#1 +# asm 2: vpxor 32(r1=%ymm0 +vpxor 32(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 32 ] = r1 +# asm 1: vmovupd r0=reg256#1 +# asm 2: vpxor 0(r0=%ymm0 +vpxor 0(%rdi),%ymm3,%ymm0 + +# qhasm: mem256[ input_0 + 0 ] = r0 +# asm 1: vmovupd r0=reg256#1 +# asm 2: vpxor 0(r0=%ymm0 +vpxor 0(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 0 ] = r0 +# asm 1: vmovupd b0=reg256#1 +# asm 2: vmovupd 0(b0=%ymm0 +vmovupd 0(%rdx),%ymm0 + +# qhasm: a12 = mem256[ input_1 + 384 ] +# asm 1: vmovupd 384(a12=reg256#2 +# asm 2: vmovupd 384(a12=%ymm1 +vmovupd 384(%rsi),%ymm1 + +# qhasm: r12 = a12 & b0 +# asm 1: vpand r12=reg256#3 +# asm 2: vpand r12=%ymm2 +vpand %ymm1,%ymm0,%ymm2 + +# qhasm: r13 = a12 & mem256[input_2 + 32] +# asm 1: vpand 32(r13=reg256#4 +# asm 2: vpand 32(r13=%ymm3 +vpand 32(%rdx),%ymm1,%ymm3 + +# qhasm: r14 = a12 & mem256[input_2 + 64] +# asm 1: vpand 64(r14=reg256#5 +# asm 2: vpand 64(r14=%ymm4 +vpand 64(%rdx),%ymm1,%ymm4 + +# qhasm: r15 = a12 & mem256[input_2 + 96] +# asm 1: vpand 96(r15=reg256#6 +# asm 2: vpand 96(r15=%ymm5 +vpand 96(%rdx),%ymm1,%ymm5 + +# qhasm: r16 = a12 & mem256[input_2 + 128] +# asm 1: vpand 128(r16=reg256#7 +# asm 2: vpand 128(r16=%ymm6 +vpand 128(%rdx),%ymm1,%ymm6 + +# qhasm: r17 = a12 & mem256[input_2 + 160] +# asm 1: vpand 160(r17=reg256#8 +# asm 2: vpand 160(r17=%ymm7 +vpand 160(%rdx),%ymm1,%ymm7 + +# qhasm: r18 = a12 & mem256[input_2 + 192] +# asm 1: vpand 192(r18=reg256#9 +# asm 2: vpand 192(r18=%ymm8 +vpand 192(%rdx),%ymm1,%ymm8 + +# qhasm: r19 = a12 & mem256[input_2 + 224] +# asm 1: vpand 224(r19=reg256#10 +# asm 2: vpand 224(r19=%ymm9 +vpand 224(%rdx),%ymm1,%ymm9 + +# qhasm: r20 = a12 & mem256[input_2 + 256] +# asm 1: vpand 256(r20=reg256#11 +# asm 2: vpand 256(r20=%ymm10 +vpand 256(%rdx),%ymm1,%ymm10 + +# qhasm: r21 = a12 & mem256[input_2 + 288] +# asm 1: vpand 288(r21=reg256#12 +# asm 2: vpand 288(r21=%ymm11 +vpand 288(%rdx),%ymm1,%ymm11 + +# qhasm: r22 = a12 & mem256[input_2 + 320] +# asm 1: vpand 320(r22=reg256#13 +# asm 2: vpand 320(r22=%ymm12 +vpand 320(%rdx),%ymm1,%ymm12 + +# qhasm: r23 = a12 & mem256[input_2 + 352] +# asm 1: vpand 352(r23=reg256#14 +# asm 2: vpand 352(r23=%ymm13 +vpand 352(%rdx),%ymm1,%ymm13 + +# qhasm: r24 = a12 & mem256[input_2 + 384] +# asm 1: vpand 384(r24=reg256#2 +# asm 2: vpand 384(r24=%ymm1 +vpand 384(%rdx),%ymm1,%ymm1 + +# qhasm: r15 ^= 
r24 +# asm 1: vpxor r11=reg256#2 +# asm 2: vmovapd r11=%ymm1 +vmovapd %ymm1,%ymm1 + +# qhasm: a11 = mem256[ input_1 + 352 ] +# asm 1: vmovupd 352(a11=reg256#15 +# asm 2: vmovupd 352(a11=%ymm14 +vmovupd 352(%rsi),%ymm14 + +# qhasm: r = a11 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r21 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r22 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r23 ^= r +# asm 1: vpxor r10=reg256#14 +# asm 2: vmovapd r10=%ymm13 +vmovapd %ymm13,%ymm13 + +# qhasm: a10 = mem256[ input_1 + 320 ] +# asm 1: vmovupd 320(a10=reg256#15 +# asm 2: vmovupd 320(a10=%ymm14 +vmovupd 320(%rsi),%ymm14 + +# qhasm: r = a10 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r21 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r22 ^= r +# asm 1: vpxor r9=reg256#13 +# asm 2: vmovapd r9=%ymm12 +vmovapd %ymm12,%ymm12 + +# qhasm: a9 = mem256[ input_1 + 288 ] +# asm 1: vmovupd 288(a9=reg256#15 +# asm 2: vmovupd 288(a9=%ymm14 +vmovupd 288(%rsi),%ymm14 + +# qhasm: r = a9 & b0 +# asm 1: vpand 
r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r21 ^= r +# asm 1: vpxor r8=reg256#12 +# asm 2: vmovapd r8=%ymm11 +vmovapd %ymm11,%ymm11 + +# qhasm: a8 = mem256[ input_1 + 256 ] +# asm 1: vmovupd 256(a8=reg256#15 +# asm 2: vmovupd 256(a8=%ymm14 +vmovupd 256(%rsi),%ymm14 + +# qhasm: r = a8 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r20 ^= r +# asm 1: vpxor r7=reg256#11 +# asm 2: vmovapd r7=%ymm10 +vmovapd %ymm10,%ymm10 + +# qhasm: a7 = mem256[ input_1 + 224 ] +# asm 1: vmovupd 224(a7=reg256#15 +# asm 2: vmovupd 224(a7=%ymm14 +vmovupd 224(%rsi),%ymm14 + +# qhasm: r = a7 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 
+ +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r19 ^= r +# asm 1: vpxor r6=reg256#10 +# asm 2: vmovapd r6=%ymm9 +vmovapd %ymm9,%ymm9 + +# qhasm: a6 = mem256[ input_1 + 192 ] +# asm 1: vmovupd 192(a6=reg256#15 +# asm 2: vmovupd 192(a6=%ymm14 +vmovupd 192(%rsi),%ymm14 + +# qhasm: r = a6 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r18 ^= r +# asm 1: vpxor r5=reg256#9 +# asm 2: vmovapd r5=%ymm8 +vmovapd %ymm8,%ymm8 + +# qhasm: a5 = mem256[ input_1 + 160 ] +# asm 1: vmovupd 160(a5=reg256#15 +# asm 2: vmovupd 160(a5=%ymm14 +vmovupd 160(%rsi),%ymm14 + +# qhasm: r = a5 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 
160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r17 ^= r +# asm 1: vpxor r4=reg256#8 +# asm 2: vmovapd r4=%ymm7 +vmovapd %ymm7,%ymm7 + +# qhasm: a4 = mem256[ input_1 + 128 ] +# asm 1: vmovupd 128(a4=reg256#15 +# asm 2: vmovupd 128(a4=%ymm14 +vmovupd 128(%rsi),%ymm14 + +# qhasm: r = a4 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r16 ^= r +# asm 1: vpxor r3=reg256#7 +# asm 2: vmovapd r3=%ymm6 +vmovapd %ymm6,%ymm6 + +# qhasm: a3 = mem256[ input_1 + 96 ] +# asm 1: vmovupd 96(a3=reg256#15 +# asm 2: vmovupd 96(a3=%ymm14 +vmovupd 96(%rsi),%ymm14 + +# qhasm: r = a3 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: 
vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r15 ^= r +# asm 1: vpxor r2=reg256#6 +# asm 2: vmovapd r2=%ymm5 +vmovapd %ymm5,%ymm5 + +# qhasm: a2 = mem256[ input_1 + 64 ] +# asm 1: vmovupd 64(a2=reg256#15 +# asm 2: vmovupd 64(a2=%ymm14 +vmovupd 64(%rsi),%ymm14 + +# qhasm: r = a2 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r14 ^= r +# asm 1: vpxor r1=reg256#5 +# asm 2: vmovapd r1=%ymm4 +vmovapd %ymm4,%ymm4 + +# qhasm: a1 = mem256[ input_1 + 32 ] +# asm 1: vmovupd 32(a1=reg256#15 +# asm 2: vmovupd 32(a1=%ymm14 +vmovupd 32(%rsi),%ymm14 + +# qhasm: r = a1 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r1 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# 
qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r13 ^= r +# asm 1: vpxor r0=reg256#4 +# asm 2: vmovapd r0=%ymm3 +vmovapd %ymm3,%ymm3 + +# qhasm: a0 = mem256[ input_1 + 0 ] +# asm 1: vmovupd 0(a0=reg256#15 +# asm 2: vmovupd 0(a0=%ymm14 +vmovupd 0(%rsi),%ymm14 + +# qhasm: r = a0 & b0 +# asm 1: vpand r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm14,%ymm0,%ymm0 + +# qhasm: r0 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 32(r=%ymm0 +vpand 32(%rdx),%ymm14,%ymm0 + +# qhasm: r1 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 64(r=%ymm0 +vpand 64(%rdx),%ymm14,%ymm0 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 96(r=%ymm0 +vpand 96(%rdx),%ymm14,%ymm0 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 128(r=%ymm0 +vpand 128(%rdx),%ymm14,%ymm0 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 160(r=%ymm0 +vpand 160(%rdx),%ymm14,%ymm0 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 192(r=%ymm0 +vpand 192(%rdx),%ymm14,%ymm0 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 224(r=%ymm0 +vpand 224(%rdx),%ymm14,%ymm0 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 256(r=%ymm0 +vpand 256(%rdx),%ymm14,%ymm0 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 288(r=%ymm0 +vpand 288(%rdx),%ymm14,%ymm0 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 320(r=%ymm0 +vpand 320(%rdx),%ymm14,%ymm0 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 352(r=%ymm0 +vpand 352(%rdx),%ymm14,%ymm0 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 384(r=%ymm0 +vpand 384(%rdx),%ymm14,%ymm0 + +# qhasm: r12 ^= r +# asm 1: vpxor r=int64#7 +# asm 2: mov $0,>r=%rax +mov $0,%rax + +# qhasm: t0 = mem64[ input_0 + 192 ] +# asm 1: movq 192(t0=int64#2 +# asm 2: movq 192(t0=%rsi +movq 192(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 200 ] +# asm 1: movq 200(t1=int64#3 +# asm 2: movq 200(t1=%rdx +movq 200(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 176(t0=%rsi +movq 176(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 184 ] +# asm 1: movq 184(t1=int64#3 +# asm 2: movq 184(t1=%rdx +movq 184(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 160(t0=%rsi +movq 160(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 168 ] +# asm 1: movq 168(t1=int64#3 +# asm 2: movq 168(t1=%rdx +movq 168(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 144(t0=%rsi +movq 144(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 152 ] +# asm 1: movq 152(t1=int64#3 +# asm 2: movq 152(t1=%rdx +movq 152(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 128(t0=%rsi +movq 128(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 136 ] +# asm 1: movq 136(t1=int64#3 +# asm 2: movq 136(t1=%rdx +movq 136(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 112(t0=%rsi +movq 112(%rdi),%rsi + +# 
qhasm: t1 = mem64[ input_0 + 120 ] +# asm 1: movq 120(t1=int64#3 +# asm 2: movq 120(t1=%rdx +movq 120(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 96(t0=%rsi +movq 96(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 104 ] +# asm 1: movq 104(t1=int64#3 +# asm 2: movq 104(t1=%rdx +movq 104(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 80(t0=%rsi +movq 80(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 88 ] +# asm 1: movq 88(t1=int64#3 +# asm 2: movq 88(t1=%rdx +movq 88(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 64(t0=%rsi +movq 64(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 72 ] +# asm 1: movq 72(t1=int64#3 +# asm 2: movq 72(t1=%rdx +movq 72(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 48(t0=%rsi +movq 48(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 56 ] +# asm 1: movq 56(t1=int64#3 +# asm 2: movq 56(t1=%rdx +movq 56(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 32(t0=%rsi +movq 32(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 40 ] +# asm 1: movq 40(t1=int64#3 +# asm 2: movq 40(t1=%rdx +movq 40(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 16(t0=%rsi +movq 16(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 24 ] +# asm 1: movq 24(t1=int64#3 +# asm 2: movq 24(t1=%rdx +movq 24(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 0(t0=%rsi +movq 0(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(t1=int64#1 +# asm 2: movq 8(t1=%rdi +movq 8(%rdi),%rdi + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#1 +# asm 2: popcnt c=%rdi +popcnt %rsi, %rdi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1, +#include + +#include "aes.h" + + +void PQCLEAN_MCELIECE8192128F_AVX_aes256ctr( + uint8_t *out, + size_t outlen, + const uint8_t nonce[AESCTR_NONCEBYTES], + const uint8_t key[AES256_KEYBYTES] +); + +#endif diff --git a/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/api.h b/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/api.h new file mode 100644 index 0000000000..3c8645bb3e --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/api.h @@ -0,0 +1,32 @@ +#ifndef PQCLEAN_MCELIECE8192128F_AVX_API_H +#define PQCLEAN_MCELIECE8192128F_AVX_API_H + +#include + +#define PQCLEAN_MCELIECE8192128F_AVX_CRYPTO_ALGNAME "Classic McEliece 8192128f" +#define PQCLEAN_MCELIECE8192128F_AVX_CRYPTO_PUBLICKEYBYTES 1357824 +#define PQCLEAN_MCELIECE8192128F_AVX_CRYPTO_SECRETKEYBYTES 14080 +#define PQCLEAN_MCELIECE8192128F_AVX_CRYPTO_CIPHERTEXTBYTES 240 +#define PQCLEAN_MCELIECE8192128F_AVX_CRYPTO_BYTES 32 + + +int PQCLEAN_MCELIECE8192128F_AVX_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const uint8_t *pk +); + +int PQCLEAN_MCELIECE8192128F_AVX_crypto_kem_dec( + uint8_t *key, + const uint8_t *c, + const uint8_t *sk +); + +int 
PQCLEAN_MCELIECE8192128F_AVX_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/benes.c b/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/benes.c new file mode 100644 index 0000000000..f371d057fc --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/benes.c @@ -0,0 +1,311 @@ +/* + This file is for Benes network related functions +*/ + +#include "benes.h" + +#include "params.h" +#include "transpose.h" +#include "util.h" + +static void layer_x(vec128 *data, vec128 *bits) { + int i; + vec128 v0, v1; + vec128 d; + + for (i = 0; i < 64; i += 2) { + v0 = PQCLEAN_MCELIECE8192128F_AVX_vec128_unpack_low(data[i + 0], data[i + 1]); + v1 = PQCLEAN_MCELIECE8192128F_AVX_vec128_unpack_high(data[i + 0], data[i + 1]); + + d = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(v0, v1); + d = PQCLEAN_MCELIECE8192128F_AVX_vec128_and(d, *bits++); + v0 = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(v0, d); + v1 = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(v1, d); + + data[i + 0] = PQCLEAN_MCELIECE8192128F_AVX_vec128_unpack_low(v0, v1); + data[i + 1] = PQCLEAN_MCELIECE8192128F_AVX_vec128_unpack_high(v0, v1); + } +} + +static void layer_0(vec128 *bs, vec128 *cond) { + int x; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 2) { + diff = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ x ], bs[ x + 1 ]); + diff = PQCLEAN_MCELIECE8192128F_AVX_vec128_and(diff, *cond++); + bs[ x ] = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ x ], diff); + bs[ x + 1 ] = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ x + 1 ], diff); + } +} + +static void layer_1(vec128 *bs, vec128 *cond) { + int x; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 4) { + diff = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ x + 0 ], bs[ x + 2 ]); + diff = PQCLEAN_MCELIECE8192128F_AVX_vec128_and(diff, cond[0]); + bs[ x + 0 ] = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ x + 0 ], diff); + bs[ x + 2 ] = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ x + 2 ], diff); + + diff = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ x + 1 ], bs[ x + 3 ]); + diff = PQCLEAN_MCELIECE8192128F_AVX_vec128_and(diff, cond[1]); + bs[ x + 1 ] = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ x + 1 ], diff); + bs[ x + 3 ] = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ x + 3 ], diff); + + cond += 2; + } +} + +static void layer_2(vec128 *bs, vec128 *cond) { + int x; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 8) { + diff = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ x + 0 ], bs[ x + 4 ]); + diff = PQCLEAN_MCELIECE8192128F_AVX_vec128_and(diff, cond[0]); + bs[ x + 0 ] = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ x + 0 ], diff); + bs[ x + 4 ] = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ x + 4 ], diff); + + diff = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ x + 1 ], bs[ x + 5 ]); + diff = PQCLEAN_MCELIECE8192128F_AVX_vec128_and(diff, cond[1]); + bs[ x + 1 ] = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ x + 1 ], diff); + bs[ x + 5 ] = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ x + 5 ], diff); + + diff = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ x + 2 ], bs[ x + 6 ]); + diff = PQCLEAN_MCELIECE8192128F_AVX_vec128_and(diff, cond[2]); + bs[ x + 2 ] = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ x + 2 ], diff); + bs[ x + 6 ] = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ x + 6 ], diff); + + diff = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ x + 3 ], bs[ x + 7 ]); + diff = PQCLEAN_MCELIECE8192128F_AVX_vec128_and(diff, cond[3]); + bs[ x + 3 ] = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ x + 3 ], diff); + bs[ x + 
7 ] = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ x + 7 ], diff); + + cond += 4; + } +} + +static void layer_3(vec128 *bs, vec128 *cond) { + int x, s; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 16) { + for (s = x; s < x + 8; s += 4) { + diff = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ s + 0 ], bs[ s + 8 ]); + diff = PQCLEAN_MCELIECE8192128F_AVX_vec128_and(diff, cond[0]); + bs[ s + 0 ] = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ s + 0 ], diff); + bs[ s + 8 ] = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ s + 8 ], diff); + + diff = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ s + 1 ], bs[ s + 9 ]); + diff = PQCLEAN_MCELIECE8192128F_AVX_vec128_and(diff, cond[1]); + bs[ s + 1 ] = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ s + 1 ], diff); + bs[ s + 9 ] = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ s + 9 ], diff); + + diff = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ s + 2 ], bs[ s + 10 ]); + diff = PQCLEAN_MCELIECE8192128F_AVX_vec128_and(diff, cond[2]); + bs[ s + 2 ] = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ s + 2 ], diff); + bs[ s + 10 ] = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ s + 10 ], diff); + + diff = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ s + 3 ], bs[ s + 11 ]); + diff = PQCLEAN_MCELIECE8192128F_AVX_vec128_and(diff, cond[3]); + bs[ s + 3 ] = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ s + 3 ], diff); + bs[ s + 11 ] = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ s + 11 ], diff); + + cond += 4; + } + } +} + +static void layer_4(vec128 *bs, vec128 *cond) { + int x, s; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 32) { + for (s = x; s < x + 16; s += 4) { + diff = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ s + 0 ], bs[ s + 16 ]); + diff = PQCLEAN_MCELIECE8192128F_AVX_vec128_and(diff, cond[0]); + bs[ s + 0 ] = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ s + 0 ], diff); + bs[ s + 16 ] = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ s + 16 ], diff); + + diff = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ s + 1 ], bs[ s + 17 ]); + diff = PQCLEAN_MCELIECE8192128F_AVX_vec128_and(diff, cond[1]); + bs[ s + 1 ] = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ s + 1 ], diff); + bs[ s + 17 ] = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ s + 17 ], diff); + + diff = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ s + 2 ], bs[ s + 18 ]); + diff = PQCLEAN_MCELIECE8192128F_AVX_vec128_and(diff, cond[2]); + bs[ s + 2 ] = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ s + 2 ], diff); + bs[ s + 18 ] = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ s + 18 ], diff); + + diff = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ s + 3 ], bs[ s + 19 ]); + diff = PQCLEAN_MCELIECE8192128F_AVX_vec128_and(diff, cond[3]); + bs[ s + 3 ] = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ s + 3 ], diff); + bs[ s + 19 ] = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ s + 19 ], diff); + + cond += 4; + } + } +} + +static void layer_5(vec128 *bs, vec128 *cond) { + int x, s; + vec128 diff; + + for (x = 0; x < (1 << 6); x += 64) { + for (s = x; s < x + 32; s += 4) { + diff = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ s + 0 ], bs[ s + 32 ]); + diff = PQCLEAN_MCELIECE8192128F_AVX_vec128_and(diff, cond[0]); + bs[ s + 0 ] = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ s + 0 ], diff); + bs[ s + 32 ] = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ s + 32 ], diff); + + diff = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ s + 1 ], bs[ s + 33 ]); + diff = PQCLEAN_MCELIECE8192128F_AVX_vec128_and(diff, cond[1]); + bs[ s + 1 ] = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ s + 1 ], diff); + bs[ s + 33 ] = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ 
s + 33 ], diff); + + diff = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ s + 2 ], bs[ s + 34 ]); + diff = PQCLEAN_MCELIECE8192128F_AVX_vec128_and(diff, cond[2]); + bs[ s + 2 ] = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ s + 2 ], diff); + bs[ s + 34 ] = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ s + 34 ], diff); + + diff = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ s + 3 ], bs[ s + 35 ]); + diff = PQCLEAN_MCELIECE8192128F_AVX_vec128_and(diff, cond[3]); + bs[ s + 3 ] = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ s + 3 ], diff); + bs[ s + 35 ] = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(bs[ s + 35 ], diff); + + cond += 4; + } + } +} + +/* input: bits, control bits as array of bytes */ +/* output: bits_int, control bits as array of 128-bit vectors */ +void PQCLEAN_MCELIECE8192128F_AVX_load_bits(vec128 bits_int[][32], const unsigned char *bits) { + int i, j; + const unsigned char *ptr = bits; + + vec128 buf[64]; + + for (i = 0; i <= 5; i += 2) { + for (j = 0; j < 64; j++) { + buf[j] = PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(PQCLEAN_MCELIECE8192128F_AVX_load8(ptr), PQCLEAN_MCELIECE8192128F_AVX_load8(ptr + 512)); + ptr += 8; + } + + PQCLEAN_MCELIECE8192128F_AVX_transpose_64x128_sp( buf ); + + for (j = 0; j < 32; j++) { + bits_int[i + 0][j] = PQCLEAN_MCELIECE8192128F_AVX_vec128_unpack_low(buf[j], buf[j + 32]); + bits_int[i + 1][j] = PQCLEAN_MCELIECE8192128F_AVX_vec128_unpack_high(buf[j], buf[j + 32]); + } + + ptr += 512; + } + + for (i = 6; i <= 18; i++) { + for (j = 0; j < 32; j++) { + bits_int[i][j] = PQCLEAN_MCELIECE8192128F_AVX_load16(ptr); + ptr += 16; + } + } + + for (i = 19; i < 25; i += 2) { + for (j = 0; j < 64; j++) { + buf[j] = PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(PQCLEAN_MCELIECE8192128F_AVX_load8(ptr), PQCLEAN_MCELIECE8192128F_AVX_load8(ptr + 512)); + ptr += 8; + } + + PQCLEAN_MCELIECE8192128F_AVX_transpose_64x128_sp( buf ); + + for (j = 0; j < 32; j++) { + bits_int[i + 0][j] = PQCLEAN_MCELIECE8192128F_AVX_vec128_unpack_low(buf[j], buf[j + 32]); + bits_int[i + 1][j] = PQCLEAN_MCELIECE8192128F_AVX_vec128_unpack_high(buf[j], buf[j + 32]); + } + + ptr += 512; + } +} + +/* input: r, sequence of bits to be permuted */ +/* b, control bits as array of 128-bit vectors */ +/* rev, 0 for normal application; !0 for inverse */ +/* output: r, permuted bits */ +void PQCLEAN_MCELIECE8192128F_AVX_benes(vec128 *r, vec128 b[][32], int rev) { + int inc; + + vec128 *b_ptr; + + if (rev == 0) { + inc = 32; + b_ptr = b[ 0]; + } else { + inc = -32; + b_ptr = b[24]; + } + + // + + PQCLEAN_MCELIECE8192128F_AVX_transpose_64x128_sp( r ); + + layer_0(r, b_ptr); + b_ptr += inc; + layer_1(r, b_ptr); + b_ptr += inc; + layer_2(r, b_ptr); + b_ptr += inc; + layer_3(r, b_ptr); + b_ptr += inc; + layer_4(r, b_ptr); + b_ptr += inc; + layer_5(r, b_ptr); + b_ptr += inc; + + PQCLEAN_MCELIECE8192128F_AVX_transpose_64x128_sp( r ); + + layer_x(r, b_ptr); + b_ptr += inc; + layer_0(r, b_ptr); + b_ptr += inc; + layer_1(r, b_ptr); + b_ptr += inc; + layer_2(r, b_ptr); + b_ptr += inc; + layer_3(r, b_ptr); + b_ptr += inc; + layer_4(r, b_ptr); + b_ptr += inc; + layer_5(r, b_ptr); + b_ptr += inc; + layer_4(r, b_ptr); + b_ptr += inc; + layer_3(r, b_ptr); + b_ptr += inc; + layer_2(r, b_ptr); + b_ptr += inc; + layer_1(r, b_ptr); + b_ptr += inc; + layer_0(r, b_ptr); + b_ptr += inc; + layer_x(r, b_ptr); + b_ptr += inc; + + PQCLEAN_MCELIECE8192128F_AVX_transpose_64x128_sp( r ); + + layer_5(r, b_ptr); + b_ptr += inc; + layer_4(r, b_ptr); + b_ptr += inc; + layer_3(r, b_ptr); + b_ptr += inc; + layer_2(r, b_ptr); + b_ptr += inc; + 
layer_1(r, b_ptr); + b_ptr += inc; + layer_0(r, b_ptr); + //b_ptr += inc; + + PQCLEAN_MCELIECE8192128F_AVX_transpose_64x128_sp( r ); +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/benes.h b/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/benes.h new file mode 100644 index 0000000000..b422e0f38f --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/benes.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE8192128F_AVX_BENES_H +#define PQCLEAN_MCELIECE8192128F_AVX_BENES_H +/* + This file is for Benes network related functions +*/ + + +#include "vec128.h" + +void PQCLEAN_MCELIECE8192128F_AVX_load_bits(vec128 /*bits_int*/[][32], const unsigned char * /*bits*/); +void PQCLEAN_MCELIECE8192128F_AVX_benes(vec128 * /*r*/, vec128 /*b*/[][32], int /*rev*/); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/bm.c b/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/bm.c new file mode 100644 index 0000000000..9b83e11e82 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/bm.c @@ -0,0 +1,219 @@ +/* + This file is for the inversion-free Berlekamp-Massey algorithm + see https://ieeexplore.ieee.org/document/87857 +*/ + +#include "bm.h" + +#include "gf.h" +#include "params.h" +#include "vec128.h" + +#include + +extern gf PQCLEAN_MCELIECE8192128F_AVX_vec_reduce_asm(vec128 *); +extern void PQCLEAN_MCELIECE8192128F_AVX_update_asm(void *, gf, int); + +static inline uint16_t mask_nonzero(gf a) { + uint32_t ret = a; + + ret -= 1; + ret >>= 31; + ret -= 1; + + return ret; +} + +static inline uint16_t mask_leq(uint16_t a, uint16_t b) { + uint32_t a_tmp = a; + uint32_t b_tmp = b; + uint32_t ret = b_tmp - a_tmp; + + ret >>= 31; + ret -= 1; + + return ret; +} + +static inline void vec128_cmov(vec128 out[][2], uint16_t mask) { + int i; + + vec128 v0, v1; + + vec128 m0 = PQCLEAN_MCELIECE8192128F_AVX_vec128_set1_16b( mask); + vec128 m1 = PQCLEAN_MCELIECE8192128F_AVX_vec128_set1_16b(~mask); + + for (i = 0; i < GFBITS; i++) { + v0 = PQCLEAN_MCELIECE8192128F_AVX_vec128_and(out[i][1], m0); + v1 = PQCLEAN_MCELIECE8192128F_AVX_vec128_and(out[i][0], m1); + out[i][0] = PQCLEAN_MCELIECE8192128F_AVX_vec128_or(v0, v1); + } +} + +static inline void interleave(vec256 *in, int idx0, int idx1, vec256 *mask, int b) { + int s = 1 << b; + + vec256 x, y; + + x = PQCLEAN_MCELIECE8192128F_AVX_vec256_or(PQCLEAN_MCELIECE8192128F_AVX_vec256_and(in[idx0], mask[0]), + PQCLEAN_MCELIECE8192128F_AVX_vec256_sll_4x(PQCLEAN_MCELIECE8192128F_AVX_vec256_and(in[idx1], mask[0]), s)); + + y = PQCLEAN_MCELIECE8192128F_AVX_vec256_or(PQCLEAN_MCELIECE8192128F_AVX_vec256_srl_4x(PQCLEAN_MCELIECE8192128F_AVX_vec256_and(in[idx0], mask[1]), s), + PQCLEAN_MCELIECE8192128F_AVX_vec256_and(in[idx1], mask[1])); + + in[idx0] = x; + in[idx1] = y; +} + +/* input: in, field elements in bitsliced form */ +/* output: out, field elements in non-bitsliced form */ +static inline void get_coefs(gf *out, vec256 *in) { + int i, k; + + vec256 mask[4][2]; + vec256 buf[16]; + + for (i = 0; i < 13; i++) { + buf[i] = in[i]; + } + for (i = 13; i < 16; i++) { + buf[i] = PQCLEAN_MCELIECE8192128F_AVX_vec256_setzero(); + } + + mask[0][0] = PQCLEAN_MCELIECE8192128F_AVX_vec256_set1_16b(0x5555); + mask[0][1] = PQCLEAN_MCELIECE8192128F_AVX_vec256_set1_16b(0xAAAA); + mask[1][0] = PQCLEAN_MCELIECE8192128F_AVX_vec256_set1_16b(0x3333); + mask[1][1] = PQCLEAN_MCELIECE8192128F_AVX_vec256_set1_16b(0xCCCC); + mask[2][0] = PQCLEAN_MCELIECE8192128F_AVX_vec256_set1_16b(0x0F0F); + mask[2][1] = 
PQCLEAN_MCELIECE8192128F_AVX_vec256_set1_16b(0xF0F0); + mask[3][0] = PQCLEAN_MCELIECE8192128F_AVX_vec256_set1_16b(0x00FF); + mask[3][1] = PQCLEAN_MCELIECE8192128F_AVX_vec256_set1_16b(0xFF00); + + interleave(buf, 0, 8, mask[3], 3); + interleave(buf, 1, 9, mask[3], 3); + interleave(buf, 2, 10, mask[3], 3); + interleave(buf, 3, 11, mask[3], 3); + interleave(buf, 4, 12, mask[3], 3); + interleave(buf, 5, 13, mask[3], 3); + interleave(buf, 6, 14, mask[3], 3); + interleave(buf, 7, 15, mask[3], 3); + + interleave(buf, 0, 4, mask[2], 2); + interleave(buf, 1, 5, mask[2], 2); + interleave(buf, 2, 6, mask[2], 2); + interleave(buf, 3, 7, mask[2], 2); + interleave(buf, 8, 12, mask[2], 2); + interleave(buf, 9, 13, mask[2], 2); + interleave(buf, 10, 14, mask[2], 2); + interleave(buf, 11, 15, mask[2], 2); + + interleave(buf, 0, 2, mask[1], 1); + interleave(buf, 1, 3, mask[1], 1); + interleave(buf, 4, 6, mask[1], 1); + interleave(buf, 5, 7, mask[1], 1); + interleave(buf, 8, 10, mask[1], 1); + interleave(buf, 9, 11, mask[1], 1); + interleave(buf, 12, 14, mask[1], 1); + interleave(buf, 13, 15, mask[1], 1); + + interleave(buf, 0, 1, mask[0], 0); + interleave(buf, 2, 3, mask[0], 0); + interleave(buf, 4, 5, mask[0], 0); + interleave(buf, 6, 7, mask[0], 0); + interleave(buf, 8, 9, mask[0], 0); + interleave(buf, 10, 11, mask[0], 0); + interleave(buf, 12, 13, mask[0], 0); + interleave(buf, 14, 15, mask[0], 0); + + for (i = 0; i < 16; i++) { + for (k = 0; k < 4; k++) { + out[ (4 * 0 + k) * 16 + i ] = (PQCLEAN_MCELIECE8192128F_AVX_vec256_extract(buf[i], 0) >> (k * 16)) & GFMASK; + out[ (4 * 1 + k) * 16 + i ] = (PQCLEAN_MCELIECE8192128F_AVX_vec256_extract(buf[i], 1) >> (k * 16)) & GFMASK; + out[ (4 * 2 + k) * 16 + i ] = (PQCLEAN_MCELIECE8192128F_AVX_vec256_extract(buf[i], 2) >> (k * 16)) & GFMASK; + out[ (4 * 3 + k) * 16 + i ] = (PQCLEAN_MCELIECE8192128F_AVX_vec256_extract(buf[i], 3) >> (k * 16)) & GFMASK; + } + } +} + +typedef union { + vec128 as_128[GFBITS][2]; + vec256 as_256[GFBITS]; +} aligned_double_vec128; + +/* input: in, sequence of field elements */ +/* output: out, minimal polynomial of in */ +void PQCLEAN_MCELIECE8192128F_AVX_bm(vec128 *out, vec256 *in) { + int i; + uint16_t N, L; + uint16_t mask; + uint64_t one = 1, t; + + vec128 prod[ GFBITS ]; + vec128 interval[GFBITS]; + + aligned_double_vec128 db; + aligned_double_vec128 BC_tmp; + aligned_double_vec128 BC; + + gf d, b, c0 = 1; + gf coefs[256]; + + // initialization + + get_coefs(coefs, in); + + BC.as_128[0][0] = PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0, one << 63); + BC.as_128[0][1] = PQCLEAN_MCELIECE8192128F_AVX_vec128_setzero(); + + for (i = 1; i < GFBITS; i++) { + BC.as_128[i][0] = BC.as_128[i][1] = PQCLEAN_MCELIECE8192128F_AVX_vec128_setzero(); + } + + b = 1; + L = 0; + + // + + for (i = 0; i < GFBITS; i++) { + interval[i] = PQCLEAN_MCELIECE8192128F_AVX_vec128_setzero(); + } + + for (N = 0; N < 256; N++) { + PQCLEAN_MCELIECE8192128F_AVX_vec128_mul_asm(prod, interval, BC.as_128[0] + 1, 32); + PQCLEAN_MCELIECE8192128F_AVX_update_asm(interval, coefs[N], 16); + + d = PQCLEAN_MCELIECE8192128F_AVX_vec_reduce_asm(prod); + + t = PQCLEAN_MCELIECE8192128F_AVX_gf_mul2(c0, coefs[N], b); + d ^= t & 0xFFFFFFFF; + + mask = mask_nonzero(d) & mask_leq(L * 2, N); + + for (i = 0; i < GFBITS; i++) { + db.as_128[i][0] = PQCLEAN_MCELIECE8192128F_AVX_vec128_setbits((d >> i) & 1); + db.as_128[i][1] = PQCLEAN_MCELIECE8192128F_AVX_vec128_setbits((b >> i) & 1); + } + + PQCLEAN_MCELIECE8192128F_AVX_vec256_mul(BC_tmp.as_256, db.as_256, BC.as_256); + + 
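/*
  A rough scalar reading of the branchless update carried out below (sketch only; d, b, L, N and mask are the variables of this function). The textbook Berlekamp-Massey step

      if (d != 0 && 2 * L <= N) { update B, b and L; }

  is evaluated without a branch: mask = mask_nonzero(d) & mask_leq(L * 2, N) is all-ones exactly when the condition holds, so selections such as

      b = (d & mask) | (b & ~mask);
      L = ((N + 1 - L) & mask) | (L & ~mask);

  pick either the new or the old value, and vec128_cmov() performs the analogous masked selection on the bitsliced B/C state held in BC.
*/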
vec128_cmov(BC.as_128, mask); + PQCLEAN_MCELIECE8192128F_AVX_update_asm(BC.as_128, c0 & mask, 32); + + for (i = 0; i < GFBITS; i++) { + BC.as_128[i][1] = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(BC_tmp.as_128[i][0], BC_tmp.as_128[i][1]); + } + + c0 = t >> 32; + b = (d & mask) | (b & ~mask); + L = ((N + 1 - L) & mask) | (L & ~mask); + } + + c0 = PQCLEAN_MCELIECE8192128F_AVX_gf_inv(c0); + + for (i = 0; i < GFBITS; i++) { + prod[i] = PQCLEAN_MCELIECE8192128F_AVX_vec128_setbits((c0 >> i) & 1); + } + + PQCLEAN_MCELIECE8192128F_AVX_vec128_mul_asm(out, prod, BC.as_128[0] + 1, 32); +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/bm.h b/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/bm.h new file mode 100644 index 0000000000..7450716d49 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/bm.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE8192128F_AVX_BM_H +#define PQCLEAN_MCELIECE8192128F_AVX_BM_H +/* + This file is for the inversion-free Berlekamp-Massey algorithm + see https://ieeexplore.ieee.org/document/87857 +*/ + +#include "vec128.h" +#include "vec256.h" + +void PQCLEAN_MCELIECE8192128F_AVX_bm(vec128 * /*out*/, vec256 * /*in*/); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/consts.S b/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/consts.S new file mode 100644 index 0000000000..57ba3d8b6e --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/consts.S @@ -0,0 +1,33 @@ +.data + +# not supported on MacOS +#.section .rodata + +.globl PQCLEAN_MCELIECE8192128F_AVX_MASK0_0 +.globl PQCLEAN_MCELIECE8192128F_AVX_MASK0_1 +.globl PQCLEAN_MCELIECE8192128F_AVX_MASK1_0 +.globl PQCLEAN_MCELIECE8192128F_AVX_MASK1_1 +.globl PQCLEAN_MCELIECE8192128F_AVX_MASK2_0 +.globl PQCLEAN_MCELIECE8192128F_AVX_MASK2_1 +.globl PQCLEAN_MCELIECE8192128F_AVX_MASK3_0 +.globl PQCLEAN_MCELIECE8192128F_AVX_MASK3_1 +.globl PQCLEAN_MCELIECE8192128F_AVX_MASK4_0 +.globl PQCLEAN_MCELIECE8192128F_AVX_MASK4_1 +.globl PQCLEAN_MCELIECE8192128F_AVX_MASK5_0 +.globl PQCLEAN_MCELIECE8192128F_AVX_MASK5_1 + +.p2align 5 + +PQCLEAN_MCELIECE8192128F_AVX_MASK0_0: .quad 0x5555555555555555, 0x5555555555555555, 0x5555555555555555, 0x5555555555555555 +PQCLEAN_MCELIECE8192128F_AVX_MASK0_1: .quad 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA, 0xAAAAAAAAAAAAAAAA +PQCLEAN_MCELIECE8192128F_AVX_MASK1_0: .quad 0x3333333333333333, 0x3333333333333333, 0x3333333333333333, 0x3333333333333333 +PQCLEAN_MCELIECE8192128F_AVX_MASK1_1: .quad 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC, 0xCCCCCCCCCCCCCCCC +PQCLEAN_MCELIECE8192128F_AVX_MASK2_0: .quad 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F +PQCLEAN_MCELIECE8192128F_AVX_MASK2_1: .quad 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0 +PQCLEAN_MCELIECE8192128F_AVX_MASK3_0: .quad 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF +PQCLEAN_MCELIECE8192128F_AVX_MASK3_1: .quad 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00 +PQCLEAN_MCELIECE8192128F_AVX_MASK4_0: .quad 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF +PQCLEAN_MCELIECE8192128F_AVX_MASK4_1: .quad 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000 +PQCLEAN_MCELIECE8192128F_AVX_MASK5_0: .quad 0x00000000FFFFFFFF, 0x00000000FFFFFFFF, 0x00000000FFFFFFFF, 0x00000000FFFFFFFF +PQCLEAN_MCELIECE8192128F_AVX_MASK5_1: .quad 0xFFFFFFFF00000000, 
0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000 + diff --git a/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/consts.inc b/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/consts.inc new file mode 100644 index 0000000000..fabf20e2ff --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/consts.inc @@ -0,0 +1,502 @@ +//64 +{ + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X6969969669699696, 0X6969969669699696, 0X6969969669699696, 0X6969969669699696), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X9966669966999966, 0X9966669966999966, 0X9966669966999966, 0X9966669966999966), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X9966669966999966, 0X9966669966999966, 0X9966669966999966, 0X9966669966999966), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0XFF0000FF00FFFF00, 0XFF0000FF00FFFF00, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XCC3333CCCC3333CC, 0XCC3333CCCC3333CC, 0XCC3333CCCC3333CC, 0XCC3333CCCC3333CC), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X9966669966999966, 0X9966669966999966, 0X9966669966999966, 0X9966669966999966), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X6666666666666666, 0X6666666666666666, 0X6666666666666666, 0X6666666666666666), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XA55AA55AA55AA55A, 0XA55AA55AA55AA55A, 0XA55AA55AA55AA55A, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XCCCC33333333CCCC, 0XCCCC33333333CCCC, 0XCCCC33333333CCCC, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0FF0F00FF00F0FF0, 0X0FF0F00FF00F0FF0, 0X0FF0F00FF00F0FF0, 0X0FF0F00FF00F0FF0), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A) +}, +//128 +{ + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X9999999966666666, 0X6666666699999999, 0X9999999966666666, 0X6666666699999999), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3, 0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XCC33CC3333CC33CC, 0X33CC33CCCC33CC33, 0XCC33CC3333CC33CC, 0X33CC33CCCC33CC33), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X3C3C3C3C3C3C3C3C, 0X3C3C3C3C3C3C3C3C, 0X3C3C3C3C3C3C3C3C, 0X3C3C3C3C3C3C3C3C), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XAA5555AAAA5555AA, 0XAA5555AAAA5555AA, 0XAA5555AAAA5555AA, 0XAA5555AAAA5555AA), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XC33C3CC33CC3C33C, 0XC33C3CC33CC3C33C, 0XC33C3CC33CC3C33C, 0XC33C3CC33CC3C33C), + 
PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X00FFFF0000FFFF00, 0XFF0000FFFF0000FF, 0X00FFFF0000FFFF00, 0XFF0000FFFF0000FF) +}, +//256 +{ + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0FF0F00FF00F0FF0, 0X0FF0F00FF00F0FF0, 0XF00F0FF00FF0F00F, 0XF00F0FF00FF0F00F), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X9669966969966996, 0X6996699696699669, 0X6996699696699669, 0X9669966969966996), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000FFFFFFFF0000, 0XFFFF00000000FFFF, 0X0000FFFFFFFF0000, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X33333333CCCCCCCC, 0X33333333CCCCCCCC, 0X33333333CCCCCCCC, 0X33333333CCCCCCCC), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0X5AA5A55AA55A5AA5, 0X5AA5A55AA55A5AA5, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X00FFFF0000FFFF00, 0XFF0000FFFF0000FF, 0XFF0000FFFF0000FF, 0X00FFFF0000FFFF00), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XC33CC33CC33CC33C, 0XC33CC33CC33CC33C, 0XC33CC33CC33CC33C, 0XC33CC33CC33CC33C), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555) +}, +//512 +{ + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XC33C3CC33CC3C33C, 0X3CC3C33CC33C3CC3, 0XC33C3CC33CC3C33C, 0X3CC3C33CC33C3CC3), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X9966669966999966, 0X6699996699666699, 0X9966669966999966, 0X6699996699666699), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X9966996699669966, 0X6699669966996699, 0X6699669966996699, 0X9966996699669966), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X6969969669699696, 0X6969969669699696, 0X6969969669699696, 0X6969969669699696), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XAA55AA5555AA55AA, 0XAA55AA5555AA55AA, 0XAA55AA5555AA55AA, 0XAA55AA5555AA55AA), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X9966996699669966, 0X9966996699669966, 0X6699669966996699, 0X6699669966996699), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X5AA5A55A5AA5A55A, 0XA55A5AA5A55A5AA5, 0X5AA5A55A5AA5A55A, 0XA55A5AA5A55A5AA5), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XC3C3C3C33C3C3C3C, 0XC3C3C3C33C3C3C3C, 0X3C3C3C3CC3C3C3C3, 0X3C3C3C3CC3C3C3C3), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X3333CCCC3333CCCC, 0X3333CCCC3333CCCC, 0XCCCC3333CCCC3333, 0XCCCC3333CCCC3333), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X9999999966666666, 0X6666666699999999, 0X6666666699999999, 0X9999999966666666), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XC33CC33CC33CC33C, 0X3CC33CC33CC33CC3, 0XC33CC33CC33CC33C, 0X3CC33CC33CC33CC3), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X6666999999996666, 0X9999666666669999, 0X9999666666669999, 0X6666999999996666) +}, +{ + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XC33C3CC33CC3C33C, 0X3CC3C33CC33C3CC3, 0XC33C3CC33CC3C33C, 0X3CC3C33CC33C3CC3), + 
PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X6699996699666699, 0X9966669966999966, 0X6699996699666699, 0X9966669966999966), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X6699669966996699, 0X9966996699669966, 0X9966996699669966, 0X6699669966996699), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X6969969669699696, 0X6969969669699696, 0X6969969669699696, 0X6969969669699696), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X55AA55AAAA55AA55, 0X55AA55AAAA55AA55, 0X55AA55AAAA55AA55, 0X55AA55AAAA55AA55), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X9966996699669966, 0X9966996699669966, 0X6699669966996699, 0X6699669966996699), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X5AA5A55A5AA5A55A, 0XA55A5AA5A55A5AA5, 0X5AA5A55A5AA5A55A, 0XA55A5AA5A55A5AA5), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XC3C3C3C33C3C3C3C, 0XC3C3C3C33C3C3C3C, 0X3C3C3C3CC3C3C3C3, 0X3C3C3C3CC3C3C3C3), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X3333CCCC3333CCCC, 0X3333CCCC3333CCCC, 0XCCCC3333CCCC3333, 0XCCCC3333CCCC3333), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X9999999966666666, 0X6666666699999999, 0X6666666699999999, 0X9999999966666666), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XC33CC33CC33CC33C, 0X3CC33CC33CC33CC3, 0XC33CC33CC33CC33C, 0X3CC33CC33CC33CC3), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X6666999999996666, 0X9999666666669999, 0X9999666666669999, 0X6666999999996666) +}, +//1024 +{ + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3, 0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA, 0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA, 0XAA5555AA55AAAA55, 0XAA5555AA55AAAA55), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XF00F0FF0F00F0FF0, 0XF00F0FF0F00F0FF0, 0X0FF0F00F0FF0F00F, 0X0FF0F00F0FF0F00F), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X9669699696696996, 0X9669699696696996, 0X9669699696696996, 0X9669699696696996), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XA55AA55AA55AA55A, 0X5AA55AA55AA55AA5, 0X5AA55AA55AA55AA5, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA, 0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XCCCC33333333CCCC, 0X3333CCCCCCCC3333, 0X3333CCCCCCCC3333, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000, 0XFFFF00000000FFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X6996699669966996, 0X9669966996699669, 0X9669966996699669, 0X6996699669966996) +}, +{ + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3, 0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA, 0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + 
PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XAA5555AA55AAAA55, 0XAA5555AA55AAAA55, 0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0FF0F00F0FF0F00F, 0X0FF0F00F0FF0F00F, 0XF00F0FF0F00F0FF0, 0XF00F0FF0F00F0FF0), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X6996966969969669, 0X6996966969969669, 0X6996966969969669, 0X6996966969969669), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XA55AA55AA55AA55A, 0X5AA55AA55AA55AA5, 0X5AA55AA55AA55AA5, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555, 0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XCCCC33333333CCCC, 0X3333CCCCCCCC3333, 0X3333CCCCCCCC3333, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000, 0XFFFF00000000FFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X6996699669966996, 0X9669966996699669, 0X9669966996699669, 0X6996699669966996) +}, +{ + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3, 0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555, 0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XAA5555AA55AAAA55, 0XAA5555AA55AAAA55, 0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XF00F0FF0F00F0FF0, 0XF00F0FF0F00F0FF0, 0X0FF0F00F0FF0F00F, 0X0FF0F00F0FF0F00F), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X9669699696696996, 0X9669699696696996, 0X9669699696696996, 0X9669699696696996), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XA55AA55AA55AA55A, 0X5AA55AA55AA55AA5, 0X5AA55AA55AA55AA5, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X55555555AAAAAAAA, 0X55555555AAAAAAAA, 0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XCCCC33333333CCCC, 0X3333CCCCCCCC3333, 0X3333CCCCCCCC3333, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000, 0XFFFF00000000FFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X6996699669966996, 0X9669966996699669, 0X9669966996699669, 0X6996699669966996) +}, +{ + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3, 0X3C3CC3C3C3C33C3C, 0XC3C33C3C3C3CC3C3), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555, 0XAAAAAAAA55555555, 0XAAAAAAAA55555555), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X55AAAA55AA5555AA, 0X55AAAA55AA5555AA, 0XAA5555AA55AAAA55, 0XAA5555AA55AAAA55), + 
PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0FF0F00F0FF0F00F, 0X0FF0F00F0FF0F00F, 0XF00F0FF0F00F0FF0, 0XF00F0FF0F00F0FF0), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X6996966969969669, 0X6996966969969669, 0X6996966969969669, 0X6996966969969669), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XA55AA55AA55AA55A, 0X5AA55AA55AA55AA5, 0X5AA55AA55AA55AA5, 0XA55AA55AA55AA55A), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XAAAAAAAA55555555, 0XAAAAAAAA55555555, 0X55555555AAAAAAAA, 0X55555555AAAAAAAA), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XCCCC33333333CCCC, 0X3333CCCCCCCC3333, 0X3333CCCCCCCC3333, 0XCCCC33333333CCCC), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000, 0XFFFF00000000FFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X6996699669966996, 0X9669966996699669, 0X9669966996699669, 0X6996699669966996) +}, +//2048 +{ + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA, 0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F, 0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55, 0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0, 0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + 
PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55, 0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F, 0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA, 0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0, 0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + 
PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X5555555555555555, 0X5555555555555555, 0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA, 0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F, 0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X5555555555555555, 0X5555555555555555, 0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55, 0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0, 0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + 
PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X5555555555555555, 0X5555555555555555, 0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3, 0XC33C3CC3C33C3CC3), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55, 0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F, 0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +{ + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X5555555555555555, 0X5555555555555555, 0X5555555555555555, 0X5555555555555555), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C, 0X3CC3C33C3CC3C33C), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X55AA55AA55AA55AA, 0X55AA55AA55AA55AA, 0XAA55AA55AA55AA55, 0XAA55AA55AA55AA55), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XF0F0F0F00F0F0F0F, 0X0F0F0F0FF0F0F0F0, 0X0F0F0F0FF0F0F0F0, 0XF0F0F0F00F0F0F0F), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X33CCCC33CC3333CC, 0XCC3333CC33CCCC33, 0XCC3333CC33CCCC33, 0X33CCCC33CC3333CC), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF, 0XFF0000FF00FFFF00, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X6996966996696996, 0X9669699669969669, 0X6996966996696996, 0X9669699669969669), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A, 0XA55A5AA55AA5A55A), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X6996966996696996, 0X6996966996696996, 0X6996966996696996, 0X6996966996696996) +}, +//4096 +{ + 
PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + 
PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + 
PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + 
PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + 
PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + 
PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + 
PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +}, +{ + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + 
PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00, 0XFF00FF00FF00FF00), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0, 0XF0F0F0F0F0F0F0F0), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC, 0XCCCCCCCCCCCCCCCC), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA, 0XAAAAAAAAAAAAAAAA) +} diff --git a/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/controlbits.c b/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/controlbits.c new file mode 100644 index 0000000000..c99b54fe84 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/controlbits.c @@ -0,0 +1,274 @@ +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. 
a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + +#include "controlbits.h" + +#include "params.h" + +#include + +typedef uint8_t bit; + +#define N (1 << GFBITS) + +static bit is_smaller(uint32_t a, uint32_t b) { + uint32_t ret = 0; + + ret = a - b; + ret >>= 31; + + return (bit)ret; +} + +static bit is_smaller_63b(uint64_t a, uint64_t b) { + uint64_t ret = 0; + + ret = a - b; + ret >>= 63; + + return (bit)ret; +} + +static void cswap(uint32_t *x, uint32_t *y, bit swap) { + uint32_t m; + uint32_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +static void cswap_63b(uint64_t *x, uint64_t *y, bit swap) { + uint64_t m; + uint64_t d; + + m = swap; + m = 0 - m; + + d = (*x ^ *y); + d &= m; + *x ^= d; + *y ^= d; +} + +/* output x = min(input x,input y) */ +/* output y = max(input x,input y) */ + +static void minmax(uint32_t *x, uint32_t *y) { + bit m; + + m = is_smaller(*y, *x); + cswap(x, y, m); +} + +static void minmax_63b(uint64_t *x, uint64_t *y) { + bit m; + + m = is_smaller_63b(*y, *x); + cswap_63b(x, y, m); +} + +/* merge first half of x[0],x[step],...,x[(2*n-1)*step] with second half */ +/* requires n to be a power of 2 */ + +static void merge(int n, uint32_t *x, int step) { + int i; + if (n == 1) { + minmax(&x[0], &x[step]); + } else { + merge(n / 2, x, step * 2); + merge(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax(&x[i * step], &x[(i + 1) * step]); + } + } +} + +static void merge_63b(int n, uint64_t *x, int step) { + int i; + if (n == 1) { + minmax_63b(&x[0], &x[step]); + } else { + merge_63b(n / 2, x, step * 2); + merge_63b(n / 2, x + step, step * 2); + for (i = 1; i < 2 * n - 1; i += 2) { + minmax_63b(&x[i * step], &x[(i + 1) * step]); + } + } +} + +/* sort x[0],x[1],...,x[n-1] in place */ +/* requires n to be a power of 2 */ + +static void sort(int n, uint32_t *x) { + if (n <= 1) { + return; + } + sort(n / 2, x); + sort(n / 2, x + n / 2); + merge(n / 2, x, 1); +} + +void PQCLEAN_MCELIECE8192128F_AVX_sort_63b(int n, uint64_t *x) { + if (n <= 1) { + return; + } + PQCLEAN_MCELIECE8192128F_AVX_sort_63b(n / 2, x); + PQCLEAN_MCELIECE8192128F_AVX_sort_63b(n / 2, x + n / 2); + merge_63b(n / 2, x, 1); +} + +/* y[pi[i]] = x[i] */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void composeinv(int n, uint32_t *y, const uint32_t *x, const uint32_t *pi) { // NC + int i; + uint32_t t[2 * N]; + + for (i = 0; i < n; ++i) { + t[i] = x[i] | (pi[i] << 16); + } + + sort(n, t); + + for (i = 0; i < n; ++i) { + y[i] = t[i] & 0xFFFF; + } +} + +/* ip[i] = j iff pi[i] = j */ +/* requires n = 2^w */ +/* requires pi to be a permutation */ +static void invert(int n, uint32_t *ip, const uint32_t *pi) { + int i; + + for (i = 0; i < n; i++) { + ip[i] = i; + } + + composeinv(n, ip, ip, pi); +} + + +static void flow(int w, uint32_t *x, const uint32_t *y, int t) { + bit m0; + bit m1; + + uint32_t b; + uint32_t y_copy = *y; + + m0 = is_smaller(*y & ((1 << w) - 1), *x & ((1 << w) - 1)); + m1 = is_smaller(0, t); + + cswap(x, &y_copy, m0); + b = m0 & m1; + *x ^= b << w; +} + +/* input: permutation pi */ +/* output: (2w-1)n/2 (or 0 if n==1) control bits c[0],c[step],c[2*step],... 
*/ +/* requires n = 2^w */ +static void controlbitsfrompermutation(int w, int n, int step, int off, unsigned char *c, const uint32_t *pi) { + int i; + int j; + int k; + int t; + uint32_t ip[N] = {0}; + uint32_t I[2 * N] = {0}; + uint32_t P[2 * N] = {0}; + uint32_t PI[2 * N] = {0}; + uint32_t T[2 * N] = {0}; + uint32_t piflip[N] = {0}; + uint32_t subpi[2][N / 2] = {{0}}; + + if (w == 1) { + c[ off / 8 ] |= (pi[0] & 1) << (off % 8); + } + if (w <= 1) { + return; + } + + invert(n, ip, pi); + + for (i = 0; i < n; ++i) { + I[i] = ip[i] | (1 << w); + I[n + i] = pi[i]; + } + + for (i = 0; i < 2 * n; ++i) { + P[i] = (i >> w) + (i & ((1 << w) - 2)) + ((i & 1) << w); + } + + for (t = 0; t < w; ++t) { + composeinv(2 * n, PI, P, I); + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &PI[i], t); + } + + for (i = 0; i < 2 * n; ++i) { + T[i] = I[i ^ 1]; + } + + composeinv(2 * n, I, I, T); + + for (i = 0; i < 2 * n; ++i) { + T[i] = P[i ^ 1]; + } + + for (i = 0; i < 2 * n; ++i) { + flow(w, &P[i], &T[i], 1); + } + } + + for (i = 0; i < n; ++i) { + for (j = 0; j < w; ++j) { + piflip[i] = pi[i]; + } + } + + for (i = 0; i < n / 2; ++i) { + c[ (off + i * step) / 8 ] |= ((P[i * 2] >> w) & 1) << ((off + i * step) % 8); + } + for (i = 0; i < n / 2; ++i) { + c[ (off + ((w - 1)*n + i) * step) / 8 ] |= ((P[n + i * 2] >> w) & 1) << ((off + ((w - 1) * n + i) * step) % 8); + } + + for (i = 0; i < n / 2; ++i) { + cswap(&piflip[i * 2], &piflip[i * 2 + 1], (P[n + i * 2] >> w) & 1); + } + + for (k = 0; k < 2; ++k) { + for (i = 0; i < n / 2; ++i) { + subpi[k][i] = piflip[i * 2 + k] >> 1; + } + } + + for (k = 0; k < 2; ++k) { + controlbitsfrompermutation(w - 1, n / 2, step * 2, off + step * (n / 2 + k), c, subpi[k]); + } +} + +/* input: pi, a permutation*/ +/* output: out, control bits w.r.t. pi */ +void PQCLEAN_MCELIECE8192128F_AVX_controlbits(unsigned char *out, const uint32_t *pi) { + unsigned int i; + unsigned char c[ (2 * GFBITS - 1) * (1 << GFBITS) / 16 ]; + + for (i = 0; i < sizeof(c); i++) { + c[i] = 0; + } + + controlbitsfrompermutation(GFBITS, (1 << GFBITS), 1, 0, c, pi); + + for (i = 0; i < sizeof(c); i++) { + out[i] = c[i]; + } +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/controlbits.h b/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/controlbits.h new file mode 100644 index 0000000000..cd2054cc6f --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/controlbits.h @@ -0,0 +1,15 @@ +#ifndef PQCLEAN_MCELIECE8192128F_AVX_CONTROLBITS_H +#define PQCLEAN_MCELIECE8192128F_AVX_CONTROLBITS_H +/* + This file is for functions required for generating the control bits of the Benes network w.r.t. 
a random permutation + see the Lev-Pippenger-Valiant paper https://www.computer.org/csdl/trans/tc/1981/02/06312171.pdf +*/ + + +#include + +void PQCLEAN_MCELIECE8192128F_AVX_sort_63b(int n, uint64_t *x); +void PQCLEAN_MCELIECE8192128F_AVX_controlbits(unsigned char *out, const uint32_t *pi); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/crypto_hash.h b/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/crypto_hash.h new file mode 100644 index 0000000000..3f48edda88 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/crypto_hash.h @@ -0,0 +1,7 @@ +#ifndef PQCLEAN_MCELIECE8192128F_AVX_CRYPTO_HASH_H +#define PQCLEAN_MCELIECE8192128F_AVX_CRYPTO_HASH_H +#include "fips202.h" + +#define crypto_hash_32b(out,in,inlen) shake256(out, 32, in, inlen) + +#endif diff --git a/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/decrypt.c b/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/decrypt.c new file mode 100644 index 0000000000..87a8b7c70d --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/decrypt.c @@ -0,0 +1,207 @@ +/* + This file is for Niederreiter decryption +*/ + +#include "decrypt.h" + +#include "benes.h" +#include "bm.h" +#include "fft.h" +#include "fft_tr.h" +#include "params.h" +#include "util.h" + +#include + +static void scaling(vec256 out[][GFBITS], vec256 inv[][GFBITS], const unsigned char *sk, vec256 *recv) { + int i, j; + + vec128 sk_int[ GFBITS ]; + vec256 eval[32][ GFBITS ]; + vec256 tmp[ GFBITS ]; + + // computing inverses + + PQCLEAN_MCELIECE8192128F_AVX_irr_load(sk_int, sk); + + PQCLEAN_MCELIECE8192128F_AVX_fft(eval, sk_int); + + for (i = 0; i < 32; i++) { + PQCLEAN_MCELIECE8192128F_AVX_vec256_sq(eval[i], eval[i]); + } + + PQCLEAN_MCELIECE8192128F_AVX_vec256_copy(inv[0], eval[0]); + + for (i = 1; i < 32; i++) { + PQCLEAN_MCELIECE8192128F_AVX_vec256_mul(inv[i], inv[i - 1], eval[i]); + } + + PQCLEAN_MCELIECE8192128F_AVX_vec256_inv(tmp, inv[31]); + + for (i = 30; i >= 0; i--) { + PQCLEAN_MCELIECE8192128F_AVX_vec256_mul(inv[i + 1], tmp, inv[i]); + PQCLEAN_MCELIECE8192128F_AVX_vec256_mul(tmp, tmp, eval[i + 1]); + } + + PQCLEAN_MCELIECE8192128F_AVX_vec256_copy(inv[0], tmp); + + // + + for (i = 0; i < 32; i++) { + for (j = 0; j < GFBITS; j++) { + out[i][j] = PQCLEAN_MCELIECE8192128F_AVX_vec256_and(inv[i][j], recv[i]); + } + } +} + +static void scaling_inv(vec256 out[][GFBITS], vec256 inv[][GFBITS], vec256 *recv) { + int i, j; + + for (i = 0; i < 32; i++) { + for (j = 0; j < GFBITS; j++) { + out[i][j] = PQCLEAN_MCELIECE8192128F_AVX_vec256_and(inv[i][j], recv[i]); + } + } +} + +static void preprocess(vec128 *recv, const unsigned char *s) { + int i; + + recv[0] = PQCLEAN_MCELIECE8192128F_AVX_vec128_setbits(0); + + for (i = 1; i < 64; i++) { + recv[i] = recv[0]; + } + + for (i = 0; i < SYND_BYTES / 16; i++) { + recv[i] = PQCLEAN_MCELIECE8192128F_AVX_load16(s + i * 16); + } +} + +static int weight(vec256 *v) { + int i, w = 0; + + for (i = 0; i < 32; i++) { + w += (int)_mm_popcnt_u64( PQCLEAN_MCELIECE8192128F_AVX_vec256_extract(v[i], 0) ); + w += (int)_mm_popcnt_u64( PQCLEAN_MCELIECE8192128F_AVX_vec256_extract(v[i], 1) ); + w += (int)_mm_popcnt_u64( PQCLEAN_MCELIECE8192128F_AVX_vec256_extract(v[i], 2) ); + w += (int)_mm_popcnt_u64( PQCLEAN_MCELIECE8192128F_AVX_vec256_extract(v[i], 3) ); + } + + return w; +} + +static uint16_t synd_cmp(vec256 *s0, vec256 *s1) { + int i; + vec256 diff; + + diff = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(s0[0], s1[0]); + + for (i = 1; i < GFBITS; i++) { + diff = 
PQCLEAN_MCELIECE8192128F_AVX_vec256_or(diff, PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(s0[i], s1[i])); + } + + return PQCLEAN_MCELIECE8192128F_AVX_vec256_testz(diff); +} + +static void reformat_128to256(vec256 *out, vec128 *in) { + int i; + uint64_t v[4]; + + for (i = 0; i < 32; i++) { + v[0] = PQCLEAN_MCELIECE8192128F_AVX_vec128_extract(in[2 * i + 0], 0); + v[1] = PQCLEAN_MCELIECE8192128F_AVX_vec128_extract(in[2 * i + 0], 1); + v[2] = PQCLEAN_MCELIECE8192128F_AVX_vec128_extract(in[2 * i + 1], 0); + v[3] = PQCLEAN_MCELIECE8192128F_AVX_vec128_extract(in[2 * i + 1], 1); + + out[i] = PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(v[0], v[1], v[2], v[3]); + } +} + +static void reformat_256to128(vec128 *out, vec256 *in) { + int i; + uint64_t v[4]; + + for (i = 0; i < 32; i++) { + v[0] = PQCLEAN_MCELIECE8192128F_AVX_vec256_extract(in[i], 0); + v[1] = PQCLEAN_MCELIECE8192128F_AVX_vec256_extract(in[i], 1); + v[2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_extract(in[i], 2); + v[3] = PQCLEAN_MCELIECE8192128F_AVX_vec256_extract(in[i], 3); + + out[2 * i + 0] = PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(v[0], v[1]); + out[2 * i + 1] = PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(v[2], v[3]); + } +} + +/* Niederreiter decryption with the Berlekamp decoder */ +/* intput: sk, secret key */ +/* c, ciphertext (syndrome) */ +/* output: e, error vector */ +/* return: 0 for success; 1 for failure */ +int PQCLEAN_MCELIECE8192128F_AVX_decrypt(unsigned char *e, const unsigned char *sk, const unsigned char *c) { + int i; + + uint16_t check_synd; + uint16_t check_weight; + + vec256 inv[ 64 ][ GFBITS ]; + vec256 scaled[ 64 ][ GFBITS ]; + vec256 eval[ 64 ][ GFBITS ]; + + vec128 error128[ 64 ]; + vec256 error256[ 32 ]; + + vec256 s_priv[ GFBITS ]; + vec256 s_priv_cmp[ GFBITS ]; + vec128 locator[ GFBITS ]; + + vec128 recv128[ 64 ]; + vec256 recv256[ 32 ]; + vec256 allone; + + vec128 bits_int[25][32]; + + // Berlekamp decoder + + preprocess(recv128, c); + + PQCLEAN_MCELIECE8192128F_AVX_load_bits(bits_int, sk + IRR_BYTES); + PQCLEAN_MCELIECE8192128F_AVX_benes(recv128, bits_int, 1); + reformat_128to256(recv256, recv128); + + scaling(scaled, inv, sk, recv256); // scaling + PQCLEAN_MCELIECE8192128F_AVX_fft_tr(s_priv, scaled); // transposed FFT + PQCLEAN_MCELIECE8192128F_AVX_bm(locator, s_priv); // Berlekamp Massey + + PQCLEAN_MCELIECE8192128F_AVX_fft(eval, locator); // FFT + + // reencryption and weight check + + allone = PQCLEAN_MCELIECE8192128F_AVX_vec256_set1_16b(0xFFFF); + + for (i = 0; i < 32; i++) { + error256[i] = PQCLEAN_MCELIECE8192128F_AVX_vec256_or_reduce(eval[i]); + error256[i] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(error256[i], allone); + } + + check_weight = (uint16_t)(weight(error256) ^ SYS_T); + check_weight -= 1; + check_weight >>= 15; + + scaling_inv(scaled, inv, error256); + PQCLEAN_MCELIECE8192128F_AVX_fft_tr(s_priv_cmp, scaled); + + check_synd = synd_cmp(s_priv, s_priv_cmp); + + // + + reformat_256to128(error128, error256); + PQCLEAN_MCELIECE8192128F_AVX_benes(error128, bits_int, 0); + + for (i = 0; i < 64; i++) { + PQCLEAN_MCELIECE8192128F_AVX_store16(e + i * 16, error128[i]); + } + + return 1 - (check_synd & check_weight); +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/decrypt.h b/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/decrypt.h new file mode 100644 index 0000000000..e28a0913ef --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/decrypt.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_MCELIECE8192128F_AVX_DECRYPT_H +#define PQCLEAN_MCELIECE8192128F_AVX_DECRYPT_H 
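+/*
+  Illustration (an editorial sketch, not part of the upstream PQClean file):
+  the branchless equality test used for check_weight at the end of
+  PQCLEAN_MCELIECE8192128F_AVX_decrypt above.  The 16-bit version in decrypt.c
+  is safe because weight <= 8192 and SYS_T = 128, so weight ^ SYS_T always
+  stays below 2^15; the widened variant below works for any 16-bit inputs.
+
+      #include <stdint.h>
+      #include <stdio.h>
+
+      // returns 1 when a == b, 0 otherwise, with no data-dependent branch
+      static int eq16(uint16_t a, uint16_t b) {
+          uint32_t r = (uint32_t)(a ^ b); // 0 iff a == b, at most 0xFFFF
+          r -= 1;                         // wraps to 0xFFFFFFFF iff a == b
+          r >>= 31;                       // isolate the "equal" bit
+          return (int)r;
+      }
+
+      int main(void) {
+          printf("%d %d\n", eq16(128, 128), eq16(128, 127)); // prints "1 0"
+          return 0;
+      }
+*/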
+/* + This file is for Nieddereiter decryption +*/ + +int PQCLEAN_MCELIECE8192128F_AVX_decrypt(unsigned char * /*e*/, const unsigned char * /*sk*/, const unsigned char * /*c*/); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/encrypt.c b/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/encrypt.c new file mode 100644 index 0000000000..d2a032da7d --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/encrypt.c @@ -0,0 +1,80 @@ +/* + This file is for Niederreiter encryption +*/ + +#include "encrypt.h" + +#include "int32_sort.h" +#include "params.h" +#include "randombytes.h" +#include "util.h" + +#include + +/* input: public key pk, error vector e */ +/* output: syndrome s */ +extern void PQCLEAN_MCELIECE8192128F_AVX_syndrome_asm(unsigned char *s, const unsigned char *pk, unsigned char *e); + +/* output: e, an error vector of weight t */ +static void gen_e(unsigned char *e) { + int i, j, eq; + + uint16_t ind[ SYS_T ]; + int32_t ind32[ SYS_T ]; + uint64_t e_int[ SYS_N / 64 ]; + uint64_t one = 1; + uint64_t mask; + uint64_t val[ SYS_T ]; + + while (1) { + randombytes((unsigned char *) ind, sizeof(ind)); + + for (i = 0; i < SYS_T; i++) { + ind32[i] = ind[i] &= GFMASK; + } + + // check for repetition + + PQCLEAN_MCELIECE8192128F_AVX_int32_sort(ind32, SYS_T); + + eq = 0; + for (i = 1; i < SYS_T; i++) { + if (ind32[i - 1] == ind32[i]) { + eq = 1; + } + } + + if (eq == 0) { + break; + } + } + + for (j = 0; j < SYS_T; j++) { + val[j] = one << (ind[j] & 63); + } + + for (i = 0; i < SYS_N / 64; i++) { + e_int[i] = 0; + + for (j = 0; j < SYS_T; j++) { + mask = i ^ (ind[j] >> 6); + mask -= 1; + mask >>= 63; + mask = -mask; + + e_int[i] |= val[j] & mask; + } + } + + for (i = 0; i < SYS_N / 64; i++) { + PQCLEAN_MCELIECE8192128F_AVX_store8(e + i * 8, e_int[i]); + } +} + +/* input: public key pk */ +/* output: error vector e, syndrome s */ +void PQCLEAN_MCELIECE8192128F_AVX_encrypt(unsigned char *s, unsigned char *e, const unsigned char *pk) { + gen_e(e); + PQCLEAN_MCELIECE8192128F_AVX_syndrome_asm(s, pk, e); +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/encrypt.h b/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/encrypt.h new file mode 100644 index 0000000000..177c6ae31b --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/encrypt.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_MCELIECE8192128F_AVX_ENCRYPT_H +#define PQCLEAN_MCELIECE8192128F_AVX_ENCRYPT_H +/* + This file is for Niederreiter encryption +*/ + + +void PQCLEAN_MCELIECE8192128F_AVX_encrypt(unsigned char * /*s*/, unsigned char * /*e*/, const unsigned char * /*pk*/); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/fft.c b/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/fft.c new file mode 100644 index 0000000000..33f13bd679 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/fft.c @@ -0,0 +1,275 @@ +/* + This file is for the Gao-Mateer FFT + sse http://www.math.clemson.edu/~sgao/papers/GM10.pdf +*/ + +#include "fft.h" + +#include "transpose.h" +#include "vec256.h" + +#include + +/* input: in, polynomial in bitsliced form */ +/* output: in, result of applying the radix conversions on in */ +static void radix_conversions(vec128 *in) { + int i, j, k; + vec128 t; + uint64_t v0, v1; + + const vec128 mask[5][2] = { + { + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0x8888888888888888, 0x8888888888888888), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0x4444444444444444, 0x4444444444444444) + }, + { + 
PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0xC0C0C0C0C0C0C0C0, 0xC0C0C0C0C0C0C0C0), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0x3030303030303030, 0x3030303030303030) + }, + { + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0xF000F000F000F000, 0xF000F000F000F000), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0x0F000F000F000F00, 0x0F000F000F000F00) + }, + { + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0xFF000000FF000000, 0xFF000000FF000000), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0x00FF000000FF0000, 0x00FF000000FF0000) + }, + { + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0xFFFF000000000000, 0xFFFF000000000000), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0x0000FFFF00000000, 0x0000FFFF00000000) + } + }; + + const vec128 s[5][GFBITS] = { +#include "scalars_2x.inc" + }; + + // + + for (j = 0; j <= 5; j++) { + for (i = 0; i < GFBITS; i++) { + v1 = PQCLEAN_MCELIECE8192128F_AVX_vec128_extract(in[i], 1); + v1 ^= v1 >> 32; + v0 = PQCLEAN_MCELIECE8192128F_AVX_vec128_extract(in[i], 0); + v0 ^= v1 << 32; + in[i] = PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(v0, v1); + } + + for (i = 0; i < GFBITS; i++) { + for (k = 4; k >= j; k--) { + t = PQCLEAN_MCELIECE8192128F_AVX_vec128_and(in[i], mask[k][0]); + t = PQCLEAN_MCELIECE8192128F_AVX_vec128_srl_2x(t, 1 << k); + in[i] = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(in[i], t); + + t = PQCLEAN_MCELIECE8192128F_AVX_vec128_and(in[i], mask[k][1]); + t = PQCLEAN_MCELIECE8192128F_AVX_vec128_srl_2x(t, 1 << k); + in[i] = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(in[i], t); + } + } + + if (j < 5) { + PQCLEAN_MCELIECE8192128F_AVX_vec128_mul(in, in, s[j]); // scaling + } + } +} + +/* input: in, result of applying the radix conversions to the input polynomial */ +/* output: out, evaluation results (by applying the FFT butterflies) */ +static void butterflies(vec256 out[][ GFBITS ], vec128 *in) { + int i, j, k, s, b; + + vec128 tmp[ GFBITS ]; + vec256 tmp0[ GFBITS ]; + vec256 tmp1[ GFBITS ]; + vec128 t[ GFBITS ]; + + union { + vec128 v[8][ GFBITS + 1 ]; + vec256 V[8][ (GFBITS + 1) / 2 ]; + } pre; + + union { + vec128 v[64][ 2 ]; + vec256 V[64]; + } buf; + + uint64_t v0, v1; + + const vec256 consts[ 33 ][ GFBITS ] = { +#include "consts.inc" + }; + + const vec256 powers[ 32 ][ GFBITS ] = { +#include "powers.inc" + }; + + uint64_t consts_ptr = 2; + + const unsigned char reversal[64] = { + 0, 32, 16, 48, 8, 40, 24, 56, + 4, 36, 20, 52, 12, 44, 28, 60, + 2, 34, 18, 50, 10, 42, 26, 58, + 6, 38, 22, 54, 14, 46, 30, 62, + 1, 33, 17, 49, 9, 41, 25, 57, + 5, 37, 21, 53, 13, 45, 29, 61, + 3, 35, 19, 51, 11, 43, 27, 59, + 7, 39, 23, 55, 15, 47, 31, 63 + }; + + const uint16_t beta[8] = {2522, 7827, 7801, 8035, 6897, 8167, 3476, 0}; + + // boradcast + + for (j = 0; j < GFBITS; j++) { + t[j] = PQCLEAN_MCELIECE8192128F_AVX_vec128_unpack_high(in[j], in[j]); + } + + for (i = 0; i < 8; i += 2) { + for (j = 0; j < GFBITS; j++) { + v0 = (beta[i + 0] >> j) & 1; + v0 = -v0; + v1 = (beta[i + 1] >> j) & 1; + v1 = -v1; + + tmp[j] = PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(v0, v1); + } + + PQCLEAN_MCELIECE8192128F_AVX_vec128_mul(tmp, t, tmp); + + for (j = 0; j < GFBITS; j++) { + pre.v[i + 0][j] = PQCLEAN_MCELIECE8192128F_AVX_vec128_unpack_low(tmp[j], tmp[j]); + pre.v[i + 1][j] = PQCLEAN_MCELIECE8192128F_AVX_vec128_unpack_high(tmp[j], tmp[j]); + } + } + + for (i = 0; i < GFBITS; i += 2) { + if (i != GFBITS - 1) { + buf.v[0][1] = PQCLEAN_MCELIECE8192128F_AVX_vec128_unpack_low(in[i + 1], in[i + 1] ^ pre.v[6][i + 1]); + } + buf.v[0][0] = PQCLEAN_MCELIECE8192128F_AVX_vec128_unpack_low(in[i + 0], in[i + 0] ^ 
pre.v[6][i + 0]); + + buf.V[1] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[0], pre.V[0][i / 2]); + buf.V[16] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[0], pre.V[4][i / 2]); + buf.V[3] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[1], pre.V[1][i / 2]); + buf.V[48] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[16], pre.V[5][i / 2]); + buf.V[49] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[48], pre.V[0][i / 2]); + buf.V[2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[0], pre.V[1][i / 2]); + buf.V[51] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[49], pre.V[1][i / 2]); + buf.V[6] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[2], pre.V[2][i / 2]); + buf.V[50] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[51], pre.V[0][i / 2]); + buf.V[7] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[6], pre.V[0][i / 2]); + buf.V[54] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[50], pre.V[2][i / 2]); + buf.V[5] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[7], pre.V[1][i / 2]); + buf.V[55] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[54], pre.V[0][i / 2]); + buf.V[53] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[55], pre.V[1][i / 2]); + buf.V[4] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[0], pre.V[2][i / 2]); + buf.V[52] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[53], pre.V[0][i / 2]); + buf.V[12] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[4], pre.V[3][i / 2]); + buf.V[60] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[52], pre.V[3][i / 2]); + buf.V[13] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[12], pre.V[0][i / 2]); + buf.V[61] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[60], pre.V[0][i / 2]); + buf.V[15] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[13], pre.V[1][i / 2]); + buf.V[63] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[61], pre.V[1][i / 2]); + buf.V[14] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[15], pre.V[0][i / 2]); + buf.V[62] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[63], pre.V[0][i / 2]); + buf.V[10] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[14], pre.V[2][i / 2]); + buf.V[58] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[62], pre.V[2][i / 2]); + buf.V[11] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[10], pre.V[0][i / 2]); + buf.V[59] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[58], pre.V[0][i / 2]); + buf.V[9] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[11], pre.V[1][i / 2]); + buf.V[57] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[59], pre.V[1][i / 2]); + buf.V[56] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[57], pre.V[0][i / 2]); + buf.V[8] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[0], pre.V[3][i / 2]); + buf.V[40] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[56], pre.V[4][i / 2]); + buf.V[24] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[8], pre.V[4][i / 2]); + buf.V[41] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[40], pre.V[0][i / 2]); + buf.V[25] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[24], pre.V[0][i / 2]); + buf.V[43] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[41], pre.V[1][i / 2]); + buf.V[27] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[25], pre.V[1][i / 2]); + buf.V[42] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[43], pre.V[0][i / 2]); + buf.V[26] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[27], pre.V[0][i / 2]); + buf.V[46] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[42], pre.V[2][i / 2]); + buf.V[30] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[26], pre.V[2][i / 2]); + buf.V[47] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[46], 
pre.V[0][i / 2]); + buf.V[31] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[30], pre.V[0][i / 2]); + buf.V[45] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[47], pre.V[1][i / 2]); + buf.V[29] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[31], pre.V[1][i / 2]); + buf.V[44] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[45], pre.V[0][i / 2]); + buf.V[28] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[29], pre.V[0][i / 2]); + buf.V[36] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[44], pre.V[3][i / 2]); + buf.V[20] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[28], pre.V[3][i / 2]); + buf.V[37] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[36], pre.V[0][i / 2]); + buf.V[21] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[20], pre.V[0][i / 2]); + buf.V[39] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[37], pre.V[1][i / 2]); + buf.V[23] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[21], pre.V[1][i / 2]); + buf.V[38] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[39], pre.V[0][i / 2]); + buf.V[22] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[23], pre.V[0][i / 2]); + buf.V[34] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[38], pre.V[2][i / 2]); + buf.V[18] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[22], pre.V[2][i / 2]); + buf.V[35] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[34], pre.V[0][i / 2]); + buf.V[19] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[18], pre.V[0][i / 2]); + buf.V[33] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[35], pre.V[1][i / 2]); + buf.V[17] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[19], pre.V[1][i / 2]); + buf.V[32] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[33], pre.V[0][i / 2]); + + + // transpose + + PQCLEAN_MCELIECE8192128F_AVX_transpose_64x256_sp(buf.V); + + for (j = 0; j < 32; j++) { + if (i != GFBITS - 1) { + out[j][i + 1] = PQCLEAN_MCELIECE8192128F_AVX_vec256_unpack_high(buf.V[ reversal[2 * j + 0] ], buf.V[ reversal[2 * j + 1] ]); + } + out[j][i + 0] = PQCLEAN_MCELIECE8192128F_AVX_vec256_unpack_low (buf.V[ reversal[2 * j + 0] ], buf.V[ reversal[2 * j + 1] ]); + } + } + + // butterflies + + for (k = 0; k < 32; k += 2) { + for (b = 0; b < GFBITS; b++) { + tmp0[b] = PQCLEAN_MCELIECE8192128F_AVX_vec256_unpack_low (out[k][b], out[k + 1][b]); + } + for (b = 0; b < GFBITS; b++) { + tmp1[b] = PQCLEAN_MCELIECE8192128F_AVX_vec256_unpack_high (out[k][b], out[k + 1][b]); + } + + PQCLEAN_MCELIECE8192128F_AVX_vec256_maa_asm(tmp0, tmp1, consts[1]); + + for (b = 0; b < GFBITS; b++) { + out[k][b] = PQCLEAN_MCELIECE8192128F_AVX_vec256_unpack_low (tmp0[b], tmp1[b]); + } + for (b = 0; b < GFBITS; b++) { + out[k + 1][b] = PQCLEAN_MCELIECE8192128F_AVX_vec256_unpack_high (tmp0[b], tmp1[b]); + } + } + + for (i = 0; i <= 4; i++) { + s = 1 << i; + + for (j = 0; j < 32; j += 2 * s) { + for (k = j; k < j + s; k++) { + PQCLEAN_MCELIECE8192128F_AVX_vec256_maa_asm(out[k], out[k + s], consts[ consts_ptr + (k - j) ]); + } + } + + consts_ptr += (1 << i); + } + + // adding the part contributed by x^128 + + for (i = 0; i < 32; i++) { + for (b = 0; b < GFBITS; b++) { + out[i][b] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(out[i][b], powers[i][b]); + } + } +} + +/* input: in, polynomial in bitsliced form */ +/* output: out, bitsliced results of evaluating in all the field elements */ +void PQCLEAN_MCELIECE8192128F_AVX_fft(vec256 out[][GFBITS], vec128 *in) { + radix_conversions(in); + butterflies(out, in); +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/fft.h b/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/fft.h new file 
mode 100644 index 0000000000..b2fea14ff1 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/fft.h @@ -0,0 +1,17 @@ +#ifndef PQCLEAN_MCELIECE8192128F_AVX_FFT_H +#define PQCLEAN_MCELIECE8192128F_AVX_FFT_H +/* + This file is for the Gao-Mateer FFT + sse http://www.math.clemson.edu/~sgao/papers/GM10.pdf +*/ + + +#include "params.h" +#include "vec128.h" +#include "vec256.h" +#include + +void PQCLEAN_MCELIECE8192128F_AVX_fft(vec256 /*out*/[][GFBITS], vec128 * /*in*/); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/fft_tr.c b/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/fft_tr.c new file mode 100644 index 0000000000..fe27866169 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/fft_tr.c @@ -0,0 +1,379 @@ +/* + This file is for transpose of the Gao-Mateer FFT + Functions with names ending with _tr are (roughly) the transpose of the corresponding functions in fft.c +*/ + +#include "fft_tr.h" + +#include "transpose.h" + +#include + +static void radix_conversions_tr(vec256 *in) { + int i, j, k; + vec256 t; + uint64_t v[4]; + + const vec256 mask[6][2] = { + { + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0x2222222222222222, 0x2222222222222222, 0x2222222222222222, 0x2222222222222222), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0x4444444444444444, 0x4444444444444444, 0x4444444444444444, 0x4444444444444444) + }, + { + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0x0C0C0C0C0C0C0C0C, 0x0C0C0C0C0C0C0C0C, 0x0C0C0C0C0C0C0C0C, 0x0C0C0C0C0C0C0C0C), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0x3030303030303030, 0x3030303030303030, 0x3030303030303030, 0x3030303030303030) + }, + { + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0x00F000F000F000F0, 0x00F000F000F000F0, 0x00F000F000F000F0, 0x00F000F000F000F0), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0x0F000F000F000F00, 0x0F000F000F000F00, 0x0F000F000F000F00, 0x0F000F000F000F00) + }, + { + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0x0000FF000000FF00, 0x0000FF000000FF00, 0x0000FF000000FF00, 0x0000FF000000FF00), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0x00FF000000FF0000, 0x00FF000000FF0000, 0x00FF000000FF0000, 0x00FF000000FF0000) + }, + { + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0x00000000FFFF0000, 0x00000000FFFF0000, 0x00000000FFFF0000, 0x00000000FFFF0000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0x0000FFFF00000000, 0x0000FFFF00000000, 0x0000FFFF00000000, 0x0000FFFF00000000) + }, + { + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000, 0xFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0x00000000FFFFFFFF, 0x00000000FFFFFFFF, 0x00000000FFFFFFFF, 0x00000000FFFFFFFF) + } + }; + + const vec256 s[6][GFBITS] = { +#include "scalars_4x.inc" + }; + + // + + for (j = 6; j >= 0; j--) { + if (j < 6) { + PQCLEAN_MCELIECE8192128F_AVX_vec256_mul(in, in, s[j]); // scaling + } + + for (k = j; k <= 4; k++) { + for (i = 0; i < GFBITS; i++) { + t = PQCLEAN_MCELIECE8192128F_AVX_vec256_and(in[i], mask[k][0]); + t = PQCLEAN_MCELIECE8192128F_AVX_vec256_sll_4x(t, 1 << k); + in[i] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(in[i], t); + + t = PQCLEAN_MCELIECE8192128F_AVX_vec256_and(in[i], mask[k][1]); + t = PQCLEAN_MCELIECE8192128F_AVX_vec256_sll_4x(t, 1 << k); + in[i] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(in[i], t); + } + } + + if (j <= 5) { + for (i = 0; i < GFBITS; i++) { + v[0] = PQCLEAN_MCELIECE8192128F_AVX_vec256_extract(in[i], 0); + v[1] = PQCLEAN_MCELIECE8192128F_AVX_vec256_extract(in[i], 1); + v[2] = 
PQCLEAN_MCELIECE8192128F_AVX_vec256_extract(in[i], 2); + v[3] = PQCLEAN_MCELIECE8192128F_AVX_vec256_extract(in[i], 3); + + v[1] ^= v[0] >> 32; + v[1] ^= v[1] << 32; + v[3] ^= v[2] >> 32; + v[3] ^= v[3] << 32; + + in[i] = PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(v[0], v[1], v[2], v[3]); + } + } + + for (i = 0; i < GFBITS; i++) { + v[0] = PQCLEAN_MCELIECE8192128F_AVX_vec256_extract(in[i], 0); + v[1] = PQCLEAN_MCELIECE8192128F_AVX_vec256_extract(in[i], 1); + v[2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_extract(in[i], 2); + v[3] = PQCLEAN_MCELIECE8192128F_AVX_vec256_extract(in[i], 3); + + v[2] ^= v[1]; + v[3] ^= v[2]; + + in[i] = PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(v[0], v[1], v[2], v[3]); + } + + } +} + +static void butterflies_tr(vec256 *out, vec256 in[][ GFBITS ]) { + int i, j, k, s, b; + + vec256 t0[ GFBITS ]; + vec256 t1[ GFBITS ]; + vec256 t; + + vec128 out128[ GFBITS ][ 2 ]; + vec128 tmp[ GFBITS ]; + + union { + vec128 v[6][ GFBITS + 1 ]; + vec256 V[6][ (GFBITS + 1) / 2 ]; + } pre; + + union { + vec128 v[64][ 2 ]; + vec256 V[64]; + } buf; + + const vec256 consts[ 33 ][ GFBITS ] = { +#include "consts.inc" + }; + + uint64_t v[4]; + uint64_t consts_ptr = 33; + + const unsigned char reversal[] = { + 0, 32, 16, 48, 8, 40, 24, 56, + 4, 36, 20, 52, 12, 44, 28, 60, + 2, 34, 18, 50, 10, 42, 26, 58, + 6, 38, 22, 54, 14, 46, 30, 62, + 1, 33, 17, 49, 9, 41, 25, 57, + 5, 37, 21, 53, 13, 45, 29, 61, + 3, 35, 19, 51, 11, 43, 27, 59, + 7, 39, 23, 55, 15, 47, 31, 63 + }; + + const uint16_t beta[6] = {5246, 5306, 6039, 6685, 4905, 6755}; + + // butterflies + + for (i = 4; i >= 0; i--) { + s = 1 << i; + consts_ptr -= s; + + for (j = 0; j < 32; j += 2 * s) { + for (k = j; k < j + s; k++) { + PQCLEAN_MCELIECE8192128F_AVX_vec256_ama_asm(in[k], in[k + s], consts[ consts_ptr + (k - j) ]); + } + } + + } + + for (k = 0; k < 32; k += 2) { + for (b = 0; b < GFBITS; b++) { + t0[b] = PQCLEAN_MCELIECE8192128F_AVX_vec256_unpack_low(in[k][b], in[k + 1][b]); + } + for (b = 0; b < GFBITS; b++) { + t1[b] = PQCLEAN_MCELIECE8192128F_AVX_vec256_unpack_high(in[k][b], in[k + 1][b]); + } + + PQCLEAN_MCELIECE8192128F_AVX_vec256_ama_asm(t0, t1, consts[1]); + + for (b = 0; b < GFBITS; b++) { + in[k][b] = PQCLEAN_MCELIECE8192128F_AVX_vec256_unpack_low(t0[b], t1[b]); + } + for (b = 0; b < GFBITS; b++) { + in[k + 1][b] = PQCLEAN_MCELIECE8192128F_AVX_vec256_unpack_high(t0[b], t1[b]); + } + + for (b = 0; b < GFBITS; b++) { + t0[b] = PQCLEAN_MCELIECE8192128F_AVX_vec256_unpack_low_2x(in[k][b], in[k + 1][b]); + } + for (b = 0; b < GFBITS; b++) { + t1[b] = PQCLEAN_MCELIECE8192128F_AVX_vec256_unpack_high_2x(in[k][b], in[k + 1][b]); + } + + PQCLEAN_MCELIECE8192128F_AVX_vec256_ama_asm(t0, t1, consts[0]); + + for (b = 0; b < GFBITS; b++) { + in[k + 0][b] = PQCLEAN_MCELIECE8192128F_AVX_vec256_unpack_low_2x(t0[b], t1[b]); + } + for (b = 0; b < GFBITS; b++) { + in[k + 1][b] = PQCLEAN_MCELIECE8192128F_AVX_vec256_unpack_high_2x(t0[b], t1[b]); + } + } + + + // boradcast + + for (i = 0; i < GFBITS; i += 2) { + // transpose + + for (k = 0; k < 32; k++) { + if (i != GFBITS - 1) { + buf.v[ reversal[2 * k + 0] ][1] = PQCLEAN_MCELIECE8192128F_AVX_vec256_extract2x(in[ k ][i + 1], 0); + buf.v[ reversal[2 * k + 1] ][1] = PQCLEAN_MCELIECE8192128F_AVX_vec256_extract2x(in[ k ][i + 1], 1); + } + + buf.v[ reversal[2 * k + 0] ][0] = PQCLEAN_MCELIECE8192128F_AVX_vec256_extract2x(in[ k ][i + 0], 0); + buf.v[ reversal[2 * k + 1] ][0] = PQCLEAN_MCELIECE8192128F_AVX_vec256_extract2x(in[ k ][i + 0], 1); + } + + 
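+        /*
+          Illustration (an editorial sketch, not part of the upstream file):
+          the reversal[] table used for the scatter above is the 6-bit
+          bit-reversal permutation of 0..63, e.g. 1 = 000001b maps to
+          100000b = 32.  A standalone program that recomputes the table:
+
+              #include <stdio.h>
+
+              int main(void) {
+                  for (int i = 0; i < 64; i++) {
+                      int r = 0;
+                      for (int b = 0; b < 6; b++) {   // reverse the low 6 bits of i
+                          r |= ((i >> b) & 1) << (5 - b);
+                      }
+                      printf("%2d%s", r, (i % 8 == 7) ? "\n" : ", "); // rows match reversal[]
+                  }
+                  return 0;
+              }
+        */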
PQCLEAN_MCELIECE8192128F_AVX_transpose_64x256_sp(buf.V); + + // + + pre.V[0][i / 2] = buf.V[32]; + buf.V[33] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[33], buf.V[32]); + pre.V[1][i / 2] = buf.V[33]; + buf.V[35] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[35], buf.V[33]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[35]); + buf.V[34] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[34], buf.V[35]); + pre.V[2][i / 2] = buf.V[34]; + buf.V[38] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[38], buf.V[34]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[38]); + buf.V[39] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[39], buf.V[38]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[39]); + buf.V[37] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[37], buf.V[39]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[37]); + buf.V[36] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[36], buf.V[37]); + pre.V[3][i / 2] = buf.V[36]; + buf.V[44] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[44], buf.V[36]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[44]); + buf.V[45] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[45], buf.V[44]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[45]); + buf.V[47] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[47], buf.V[45]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[47]); + buf.V[46] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[46], buf.V[47]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[2][i / 2], buf.V[46]); + buf.V[42] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[42], buf.V[46]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[42]); + buf.V[43] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[43], buf.V[42]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[43]); + buf.V[41] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[41], buf.V[43]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[41]); + buf.V[40] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[40], buf.V[41]); + pre.V[4][i / 2] = buf.V[40]; + buf.V[56] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[56], buf.V[40]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[56]); + buf.V[57] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[57], buf.V[56]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[57]); + buf.V[59] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[59], buf.V[57]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[59]); + buf.V[58] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[58], buf.V[59]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[2][i / 2], buf.V[58]); + buf.V[62] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[62], buf.V[58]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[62]); + buf.V[63] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[63], buf.V[62]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[63]); + buf.V[61] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[61], buf.V[63]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[61]); + buf.V[60] = 
PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[60], buf.V[61]); + pre.V[3][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[3][i / 2], buf.V[60]); + buf.V[52] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[52], buf.V[60]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[52]); + buf.V[53] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[53], buf.V[52]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[53]); + buf.V[55] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[55], buf.V[53]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[55]); + buf.V[54] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[54], buf.V[55]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[2][i / 2], buf.V[54]); + buf.V[50] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[50], buf.V[54]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[50]); + buf.V[51] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[51], buf.V[50]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[51]); + buf.V[49] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[49], buf.V[51]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[49]); + buf.V[48] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[48], buf.V[49]); + pre.V[5][i / 2] = buf.V[48]; + buf.V[16] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[16], buf.V[48]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[16]); + buf.V[17] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[17], buf.V[16]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[17]); + buf.V[19] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[19], buf.V[17]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[19]); + buf.V[18] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[18], buf.V[19]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[2][i / 2], buf.V[18]); + buf.V[22] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[22], buf.V[18]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[22]); + buf.V[23] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[23], buf.V[22]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[23]); + buf.V[21] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[21], buf.V[23]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[21]); + buf.V[20] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[20], buf.V[21]); + pre.V[3][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[3][i / 2], buf.V[20]); + buf.V[28] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[28], buf.V[20]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[28]); + buf.V[29] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[29], buf.V[28]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[29]); + buf.V[31] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[31], buf.V[29]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[31]); + buf.V[30] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[30], buf.V[31]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[2][i / 2], buf.V[30]); + buf.V[26] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[26], buf.V[30]); + pre.V[0][i / 2] = 
PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[26]); + buf.V[27] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[27], buf.V[26]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[27]); + buf.V[25] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[25], buf.V[27]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[25]); + buf.V[24] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[24], buf.V[25]); + pre.V[4][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[4][i / 2], buf.V[24]); + buf.V[8] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[8], buf.V[24]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[8]); + buf.V[9] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[9], buf.V[8]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[9]); + buf.V[11] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[11], buf.V[9]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[11]); + buf.V[10] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[10], buf.V[11]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[2][i / 2], buf.V[10]); + buf.V[14] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[14], buf.V[10]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[14]); + buf.V[15] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[15], buf.V[14]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[15]); + buf.V[13] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[13], buf.V[15]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[13]); + buf.V[12] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[12], buf.V[13]); + pre.V[3][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[3][i / 2], buf.V[12]); + buf.V[4] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[4], buf.V[12]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[4]); + buf.V[5] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[5], buf.V[4]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[5]); + buf.V[7] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[7], buf.V[5]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[7]); + buf.V[6] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[6], buf.V[7]); + pre.V[2][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[2][i / 2], buf.V[6]); + buf.V[2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[2], buf.V[6]); + pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[2]); + buf.V[3] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[3], buf.V[2]); + pre.V[1][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[1][i / 2], buf.V[3]); + buf.V[1] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[1], buf.V[3]); + + pre.V[0][i / 2] = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(pre.V[0][i / 2], buf.V[1]); + t = PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(buf.V[0], buf.V[1]); + + if (i != GFBITS - 1) { + out128[i + 1][0] = PQCLEAN_MCELIECE8192128F_AVX_vec256_extract2x(t, 1); + } + out128[i + 0][0] = PQCLEAN_MCELIECE8192128F_AVX_vec256_extract2x(t, 0); + + } + + // + + for (j = 0; j < GFBITS; j++) { + tmp[j] = PQCLEAN_MCELIECE8192128F_AVX_vec128_setbits((beta[0] >> j) & 1); + } + + PQCLEAN_MCELIECE8192128F_AVX_vec128_mul(tmp, pre.v[0], tmp); + + for (b = 0; b < GFBITS; b++) { + out128[b][1] = tmp[b]; + } + + for (i = 1; i < 6; i++) 
{ + for (j = 0; j < GFBITS; j++) { + tmp[j] = PQCLEAN_MCELIECE8192128F_AVX_vec128_setbits((beta[i] >> j) & 1); + } + + PQCLEAN_MCELIECE8192128F_AVX_vec128_mul(tmp, pre.v[i], tmp); + + for (b = 0; b < GFBITS; b++) { + out128[b][1] = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(out128[b][1], tmp[b]); + } + } + + for (b = 0; b < GFBITS; b++) { + v[0] = PQCLEAN_MCELIECE8192128F_AVX_vec128_extract(out128[b][0], 0); + v[1] = PQCLEAN_MCELIECE8192128F_AVX_vec128_extract(out128[b][0], 1); + v[2] = PQCLEAN_MCELIECE8192128F_AVX_vec128_extract(out128[b][1], 0); + v[3] = PQCLEAN_MCELIECE8192128F_AVX_vec128_extract(out128[b][1], 1); + + out[b] = PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(v[0], v[1], v[2], v[3]); + } +} + +void PQCLEAN_MCELIECE8192128F_AVX_fft_tr(vec256 *out, vec256 in[][ GFBITS ]) { + butterflies_tr(out, in); + radix_conversions_tr(out); +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/fft_tr.h b/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/fft_tr.h new file mode 100644 index 0000000000..ea10712f1a --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/fft_tr.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_MCELIECE8192128F_AVX_FFT_TR_H +#define PQCLEAN_MCELIECE8192128F_AVX_FFT_TR_H +/* + This file is for transpose of the Gao-Mateer FFT +*/ + + +#include "params.h" +#include "vec256.h" + +void PQCLEAN_MCELIECE8192128F_AVX_fft_tr(vec256 * /*out*/, vec256 /*in*/[][ GFBITS ]); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/gf.c b/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/gf.c new file mode 100644 index 0000000000..b3cd9f205f --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/gf.c @@ -0,0 +1,236 @@ +/* + this file is for functions for field arithmetic +*/ + +#include "gf.h" + +#include "params.h" + +#include + +/* 2 field multiplications */ +uint64_t PQCLEAN_MCELIECE8192128F_AVX_gf_mul2(gf a, gf b0, gf b1) { + int i; + + uint64_t tmp = 0; + uint64_t t0; + uint64_t t1; + uint64_t t; + uint64_t mask = 0x0000000100000001; + + t0 = a; + t1 = b1; + t1 = (t1 << 32) | b0; + + for (i = 0; i < GFBITS; i++) { + tmp ^= t0 * (t1 & mask); + mask += mask; + } + + // + + t = tmp & 0x01FF000001FF0000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + t = tmp & 0x0000E0000000E000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + return tmp & 0x00001FFF00001FFF; +} + +/* field multiplication */ +gf PQCLEAN_MCELIECE8192128F_AVX_gf_mul(gf in0, gf in1) { + int i; + + uint64_t tmp; + uint64_t t0; + uint64_t t1; + uint64_t t; + + t0 = in0; + t1 = in1; + + tmp = t0 * (t1 & 1); + + for (i = 1; i < GFBITS; i++) { + tmp ^= (t0 * (t1 & ((uint64_t)1 << i))); + } + + // + + t = tmp & 0x1FF0000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + t = tmp & 0x000E000; + tmp ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + + return tmp & GFMASK; +} + +/* 2 field squarings */ +static inline gf gf_sq2(gf in) { + int i; + + const uint64_t B[] = {0x1111111111111111, + 0x0303030303030303, + 0x000F000F000F000F, + 0x000000FF000000FF + }; + + const uint64_t M[] = {0x0001FF0000000000, + 0x000000FF80000000, + 0x000000007FC00000, + 0x00000000003FE000 + }; + + uint64_t x = in; + uint64_t t; + + x = (x | (x << 24)) & B[3]; + x = (x | (x << 12)) & B[2]; + x = (x | (x << 6)) & B[1]; + x = (x | (x << 3)) & B[0]; + + for (i = 0; i < 4; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* square and multiply */ +static inline gf gf_sqmul(gf in, gf m) { + int 
i; + + uint64_t x; + uint64_t t0; + uint64_t t1; + uint64_t t; + + const uint64_t M[] = {0x0000001FF0000000, + 0x000000000FF80000, + 0x000000000007E000 + }; + + t0 = in; + t1 = m; + + x = (t1 << 6) * (t0 & (1 << 6)); + + t0 ^= (t0 << 7); + + x ^= (t1 * (t0 & (0x04001))); + x ^= (t1 * (t0 & (0x08002))) << 1; + x ^= (t1 * (t0 & (0x10004))) << 2; + x ^= (t1 * (t0 & (0x20008))) << 3; + x ^= (t1 * (t0 & (0x40010))) << 4; + x ^= (t1 * (t0 & (0x80020))) << 5; + + for (i = 0; i < 3; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* square twice and multiply */ +static inline gf gf_sq2mul(gf in, gf m) { + int i; + + uint64_t x; + uint64_t t0; + uint64_t t1; + uint64_t t; + + const uint64_t M[] = {0x1FF0000000000000, + 0x000FF80000000000, + 0x000007FC00000000, + 0x00000003FE000000, + 0x0000000001FE0000, + 0x000000000001E000 + }; + + t0 = in; + t1 = m; + + x = (t1 << 18) * (t0 & (1 << 6)); + + t0 ^= (t0 << 21); + + x ^= (t1 * (t0 & (0x010000001))); + x ^= (t1 * (t0 & (0x020000002))) << 3; + x ^= (t1 * (t0 & (0x040000004))) << 6; + x ^= (t1 * (t0 & (0x080000008))) << 9; + x ^= (t1 * (t0 & (0x100000010))) << 12; + x ^= (t1 * (t0 & (0x200000020))) << 15; + + for (i = 0; i < 6; i++) { + t = x & M[i]; + x ^= (t >> 9) ^ (t >> 10) ^ (t >> 12) ^ (t >> 13); + } + + return x & GFMASK; +} + +/* return num/den */ +gf PQCLEAN_MCELIECE8192128F_AVX_gf_frac(gf den, gf num) { + gf tmp_11; + gf tmp_1111; + gf out; + + tmp_11 = gf_sqmul(den, den); // 11 + tmp_1111 = gf_sq2mul(tmp_11, tmp_11); // 1111 + out = gf_sq2(tmp_1111); + out = gf_sq2mul(out, tmp_1111); // 11111111 + out = gf_sq2(out); + out = gf_sq2mul(out, tmp_1111); // 111111111111 + + return gf_sqmul(out, num); // 1111111111110 +} + +/* return 1/den */ +gf PQCLEAN_MCELIECE8192128F_AVX_gf_inv(gf in) { + return PQCLEAN_MCELIECE8192128F_AVX_gf_frac(in, ((gf) 1)); +} + +/* check if a == 0 */ +gf PQCLEAN_MCELIECE8192128F_AVX_gf_iszero(gf a) { + uint32_t t = a; + + t -= 1; + t >>= 19; + + return (gf) t; +} + +/* multiplication in GF((2^m)^t) */ +void PQCLEAN_MCELIECE8192128F_AVX_GF_mul(gf *out, const gf *in0, const gf *in1) { + int i, j; + + gf prod[255]; + + for (i = 0; i < 255; i++) { + prod[i] = 0; + } + + for (i = 0; i < 128; i++) { + for (j = 0; j < 128; j++) { + prod[i + j] ^= PQCLEAN_MCELIECE8192128F_AVX_gf_mul(in0[i], in1[j]); + } + } + + // + + for (i = 254; i >= 128; i--) { + prod[i - 123] ^= PQCLEAN_MCELIECE8192128F_AVX_gf_mul(prod[i], (gf) 7682); + prod[i - 125] ^= PQCLEAN_MCELIECE8192128F_AVX_gf_mul(prod[i], (gf) 2159); + prod[i - 128] ^= PQCLEAN_MCELIECE8192128F_AVX_gf_mul(prod[i], (gf) 3597); + } + + for (i = 0; i < 128; i++) { + out[i] = prod[i]; + } +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/gf.h b/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/gf.h new file mode 100644 index 0000000000..2d58e7d36b --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/gf.h @@ -0,0 +1,25 @@ +#ifndef PQCLEAN_MCELIECE8192128F_AVX_GF_H +#define PQCLEAN_MCELIECE8192128F_AVX_GF_H +/* + This file is for functions for field arithmetic +*/ + + +#include "params.h" + +#include + +typedef uint16_t gf; + +gf PQCLEAN_MCELIECE8192128F_AVX_gf_iszero(gf /*a*/); +gf PQCLEAN_MCELIECE8192128F_AVX_gf_mul(gf /*in0*/, gf /*in1*/); +gf PQCLEAN_MCELIECE8192128F_AVX_gf_frac(gf /*den*/, gf /*num*/); +gf PQCLEAN_MCELIECE8192128F_AVX_gf_inv(gf /*in*/); + +void PQCLEAN_MCELIECE8192128F_AVX_GF_mul(gf * /*out*/, const gf * /*in0*/, const gf * /*in1*/); + +/* 2 field 
multiplications */ +uint64_t PQCLEAN_MCELIECE8192128F_AVX_gf_mul2(gf a, gf b0, gf b1); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/int32_sort.c b/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/int32_sort.c new file mode 100644 index 0000000000..a35e886e20 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/int32_sort.c @@ -0,0 +1,1211 @@ +#include "int32_sort.h" +#define int32 int32_t + +#include + +typedef __m256i int32x8; +#define int32x8_load(z) _mm256_loadu_si256((__m256i_u *) (z)) +#define int32x8_store(z,i) _mm256_storeu_si256((__m256i_u *) (z),(i)) +#define int32x8_min _mm256_min_epi32 +#define int32x8_max _mm256_max_epi32 + +#define int32x8_MINMAX(a,b) \ + do { \ + int32x8 c = int32x8_min(a, b); \ + (b) = int32x8_max(a,b); \ + (a) = c; \ + } while(0) + +static inline void int32_MINMAX(int32 *a, int32 *b) { + int32 ab = *b ^ *a; + int32 c = *b - *a; + c ^= ab & (c ^ *b); + c >>= 31; + c &= ab; + *a ^= c; + *b ^= c; +} + +static void minmax_vector(int32 *x, int32 *y, size_t n) { + if (n < 8) { + while (n > 0) { + int32_MINMAX(x, y); + ++x; + ++y; + --n; + } + return; + } + if (n & 7) { + int32x8 x0 = int32x8_load(x + n - 8); + int32x8 y0 = int32x8_load(y + n - 8); + int32x8_MINMAX(x0, y0); + int32x8_store(x + n - 8, x0); + int32x8_store(y + n - 8, y0); + n &= ~7; + } + do { + int32x8 x0 = int32x8_load(x); + int32x8 y0 = int32x8_load(y); + int32x8_MINMAX(x0, y0); + int32x8_store(x, x0); + int32x8_store(y, y0); + x += 8; + y += 8; + n -= 8; + } while (n); +} + +/* stages 8,4,2,1 of size-16 bitonic merging */ +static void merge16_finish(int32 *x, int32x8 x0, int32x8 x1, int flagdown) { + int32x8 b0, b1, c0, c1, mask; + + int32x8_MINMAX(x0, x1); + + b0 = _mm256_permute2x128_si256(x0, x1, 0x20); /* A0123B0123 */ + b1 = _mm256_permute2x128_si256(x0, x1, 0x31); /* A4567B4567 */ + + int32x8_MINMAX(b0, b1); + + c0 = _mm256_unpacklo_epi64(b0, b1); /* A0145B0145 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* A2367B2367 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A0213B0213 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A4657B4657 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* A0246B0246 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* A1357B1357 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A0123B0123 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A4567B4567 */ + + x0 = _mm256_permute2x128_si256(b0, b1, 0x20); /* A01234567 */ + x1 = _mm256_permute2x128_si256(b0, b1, 0x31); /* A01234567 */ + + if (flagdown) { + mask = _mm256_set1_epi32(-1); + x0 ^= mask; + x1 ^= mask; + } + + int32x8_store(&x[0], x0); + int32x8_store(&x[8], x1); +} + +/* stages 64,32 of bitonic merging; n is multiple of 128 */ +static void int32_twostages_32(int32 *x, size_t n) { + size_t i; + + while (n > 0) { + for (i = 0; i < 32; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + 32]); + int32x8 x2 = int32x8_load(&x[i + 64]); + int32x8 x3 = int32x8_load(&x[i + 96]); + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + 32], x1); + int32x8_store(&x[i + 64], x2); + int32x8_store(&x[i + 96], x3); + } + x += 128; + n -= 128; + } +} + +/* stages 4q,2q,q of bitonic merging */ +static size_t int32_threestages(int32 *x, size_t n, size_t q) { + size_t k, i; + + for (k = 0; k + 8 * q <= n; k += 8 * q) { + for (i = k; i < k + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = 
int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + } + + return k; +} + +/* n is a power of 2; n >= 8; if n == 8 then flagdown */ +// NOLINTNEXTLINE(google-readability-function-size) +static void int32_sort_2power(int32 *x, size_t n, int flagdown) { + size_t p, q, i, j, k; + int32x8 mask; + + if (n == 8) { + int32 x0 = x[0]; + int32 x1 = x[1]; + int32 x2 = x[2]; + int32 x3 = x[3]; + int32 x4 = x[4]; + int32 x5 = x[5]; + int32 x6 = x[6]; + int32 x7 = x[7]; + + /* odd-even sort instead of bitonic sort */ + + int32_MINMAX(&x1, &x0); + int32_MINMAX(&x3, &x2); + int32_MINMAX(&x2, &x0); + int32_MINMAX(&x3, &x1); + int32_MINMAX(&x2, &x1); + + int32_MINMAX(&x5, &x4); + int32_MINMAX(&x7, &x6); + int32_MINMAX(&x6, &x4); + int32_MINMAX(&x7, &x5); + int32_MINMAX(&x6, &x5); + + int32_MINMAX(&x4, &x0); + int32_MINMAX(&x6, &x2); + int32_MINMAX(&x4, &x2); + + int32_MINMAX(&x5, &x1); + int32_MINMAX(&x7, &x3); + int32_MINMAX(&x5, &x3); + + int32_MINMAX(&x2, &x1); + int32_MINMAX(&x4, &x3); + int32_MINMAX(&x6, &x5); + + x[0] = x0; + x[1] = x1; + x[2] = x2; + x[3] = x3; + x[4] = x4; + x[5] = x5; + x[6] = x6; + x[7] = x7; + return; + } + + if (n == 16) { + int32x8 x0, x1, b0, b1, c0, c1; + + x0 = int32x8_load(&x[0]); + x1 = int32x8_load(&x[8]); + + mask = _mm256_set_epi32(0, 0, -1, -1, 0, 0, -1, -1); + + x0 ^= mask; /* A01234567 */ + x1 ^= mask; /* B01234567 */ + + b0 = _mm256_unpacklo_epi32(x0, x1); /* AB0AB1AB4AB5 */ + b1 = _mm256_unpackhi_epi32(x0, x1); /* AB2AB3AB6AB7 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* AB0AB2AB4AB6 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* AB1AB3AB5AB7 */ + + int32x8_MINMAX(c0, c1); + + mask = _mm256_set_epi32(0, 0, -1, -1, -1, -1, 0, 0); + c0 ^= mask; + c1 ^= mask; + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A01B01A45B45 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A23B23A67B67 */ + + int32x8_MINMAX(b0, b1); + + x0 = _mm256_unpacklo_epi64(b0, b1); /* A01234567 */ + x1 = _mm256_unpackhi_epi64(b0, b1); /* B01234567 */ + + b0 = _mm256_unpacklo_epi32(x0, x1); /* AB0AB1AB4AB5 */ + b1 = _mm256_unpackhi_epi32(x0, x1); /* AB2AB3AB6AB7 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* AB0AB2AB4AB6 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* AB1AB3AB5AB7 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A01B01A45B45 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A23B23A67B67 */ + + b0 ^= mask; + b1 ^= mask; + + c0 = _mm256_permute2x128_si256(b0, b1, 0x20); /* A01B01A23B23 */ + c1 = _mm256_permute2x128_si256(b0, b1, 0x31); /* A45B45A67B67 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_permute2x128_si256(c0, c1, 0x20); /* A01B01A45B45 */ + b1 = _mm256_permute2x128_si256(c0, c1, 0x31); /* A23B23A67B67 */ + + int32x8_MINMAX(b0, b1); + + x0 = 
_mm256_unpacklo_epi64(b0, b1); /* A01234567 */ + x1 = _mm256_unpackhi_epi64(b0, b1); /* B01234567 */ + + b0 = _mm256_unpacklo_epi32(x0, x1); /* AB0AB1AB4AB5 */ + b1 = _mm256_unpackhi_epi32(x0, x1); /* AB2AB3AB6AB7 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* AB0AB2AB4AB6 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* AB1AB3AB5AB7 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A01B01A45B45 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A23B23A67B67 */ + + x0 = _mm256_unpacklo_epi64(b0, b1); /* A01234567 */ + x1 = _mm256_unpackhi_epi64(b0, b1); /* B01234567 */ + + mask = _mm256_set1_epi32(-1); + if (flagdown) { + x1 ^= mask; + } else { + x0 ^= mask; + } + + merge16_finish(x, x0, x1, flagdown); + return; + } + + if (n == 32) { + int32x8 x0, x1, x2, x3; + + int32_sort_2power(x, 16, 1); + int32_sort_2power(x + 16, 16, 0); + + x0 = int32x8_load(&x[0]); + x1 = int32x8_load(&x[8]); + x2 = int32x8_load(&x[16]); + x3 = int32x8_load(&x[24]); + + if (flagdown) { + mask = _mm256_set1_epi32(-1); + x0 ^= mask; + x1 ^= mask; + x2 ^= mask; + x3 ^= mask; + } + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + + merge16_finish(x, x0, x1, flagdown); + merge16_finish(x + 16, x2, x3, flagdown); + return; + } + + p = n >> 3; + for (i = 0; i < p; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x2 = int32x8_load(&x[i + 2 * p]); + int32x8 x4 = int32x8_load(&x[i + 4 * p]); + int32x8 x6 = int32x8_load(&x[i + 6 * p]); + + /* odd-even stage instead of bitonic stage */ + + int32x8_MINMAX(x4, x0); + int32x8_MINMAX(x6, x2); + int32x8_MINMAX(x2, x0); + int32x8_MINMAX(x6, x4); + int32x8_MINMAX(x2, x4); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + 2 * p], x2); + int32x8_store(&x[i + 4 * p], x4); + int32x8_store(&x[i + 6 * p], x6); + + int32x8 x1 = int32x8_load(&x[i + p]); + int32x8 x3 = int32x8_load(&x[i + 3 * p]); + int32x8 x5 = int32x8_load(&x[i + 5 * p]); + int32x8 x7 = int32x8_load(&x[i + 7 * p]); + + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x5, x3); + + int32x8_store(&x[i + p], x1); + int32x8_store(&x[i + 3 * p], x3); + int32x8_store(&x[i + 5 * p], x5); + int32x8_store(&x[i + 7 * p], x7); + } + + if (n >= 128) { + int flip, flipflip; + + mask = _mm256_set1_epi32(-1); + + for (j = 0; j < n; j += 32) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 16]); + x0 ^= mask; + x1 ^= mask; + int32x8_store(&x[j], x0); + int32x8_store(&x[j + 16], x1); + } + + p = 8; + for (;;) { /* for p in [8, 16, ..., n/16] */ + q = p >> 1; + while (q >= 128) { + int32_threestages(x, n, q >> 2); + q >>= 3; + } + if (q == 64) { + int32_twostages_32(x, n); + q = 16; + } + if (q == 32) { + q = 8; + for (k = 0; k < n; k += 8 * q) { + for (i = k; i < k + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + 
int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + } + q = 4; + } + if (q == 16) { + q = 8; + for (k = 0; k < n; k += 4 * q) { + for (i = k; i < k + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + } + } + q = 4; + } + if (q == 8) { + for (k = 0; k < n; k += q + q) { + int32x8 x0 = int32x8_load(&x[k]); + int32x8 x1 = int32x8_load(&x[k + q]); + + int32x8_MINMAX(x0, x1); + + int32x8_store(&x[k], x0); + int32x8_store(&x[k + q], x1); + } + } + + q = n >> 3; + flip = 0; + if (p << 1 == q) { + flip = 1; + } + flipflip = 1 - flip; + for (j = 0; j < q; j += p + p) { + for (k = j; k < j + p + p; k += p) { + for (i = k; i < k + p; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + + if (flip) { + x0 ^= mask; + x1 ^= mask; + x2 ^= mask; + x3 ^= mask; + x4 ^= mask; + x5 ^= mask; + x6 ^= mask; + x7 ^= mask; + } + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + flip ^= 1; + } + flip ^= flipflip; + } + + if (p << 4 == n) { + break; + } + p <<= 1; + } + } + + for (p = 4; p >= 1; p >>= 1) { + int32 *z = x; + int32 *target = x + n; + if (p == 4) { + mask = _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1); + while (z != target) { + int32x8 x0 = int32x8_load(&z[0]); + int32x8 x1 = int32x8_load(&z[8]); + x0 ^= mask; + x1 ^= mask; + int32x8_store(&z[0], x0); + int32x8_store(&z[8], x1); + z += 16; + } + } else if (p == 2) { + mask = _mm256_set_epi32(0, 0, -1, -1, -1, -1, 0, 0); + while (z != target) { + int32x8 x0 = int32x8_load(&z[0]); + int32x8 x1 = int32x8_load(&z[8]); + x0 ^= mask; + x1 ^= mask; + int32x8 b0 = _mm256_permute2x128_si256(x0, x1, 0x20); + int32x8 b1 = _mm256_permute2x128_si256(x0, x1, 0x31); + int32x8_MINMAX(b0, b1); + int32x8 c0 = _mm256_permute2x128_si256(b0, b1, 0x20); + int32x8 c1 = _mm256_permute2x128_si256(b0, b1, 0x31); + int32x8_store(&z[0], c0); + int32x8_store(&z[8], c1); + z += 16; + } + } else { /* p == 1 */ + mask = _mm256_set_epi32(0, -1, -1, 0, 0, -1, -1, 0); + while (z != target) { + int32x8 x0 = int32x8_load(&z[0]); + int32x8 x1 = int32x8_load(&z[8]); + x0 ^= mask; + x1 ^= mask; + int32x8 b0 = _mm256_permute2x128_si256(x0, x1, 0x20); /* A0123B0123 */ + int32x8 b1 = _mm256_permute2x128_si256(x0, x1, 0x31); /* A4567B4567 */ + int32x8 c0 = 
_mm256_unpacklo_epi64(b0, b1); /* A0145B0145 */ + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); /* A2367B2367 */ + int32x8_MINMAX(c0, c1); + int32x8 d0 = _mm256_unpacklo_epi64(c0, c1); /* A0123B0123 */ + int32x8 d1 = _mm256_unpackhi_epi64(c0, c1); /* A4567B4567 */ + int32x8_MINMAX(d0, d1); + int32x8 e0 = _mm256_permute2x128_si256(d0, d1, 0x20); + int32x8 e1 = _mm256_permute2x128_si256(d0, d1, 0x31); + int32x8_store(&z[0], e0); + int32x8_store(&z[8], e1); + z += 16; + } + } + + q = n >> 4; + while (q >= 128 || q == 32) { + int32_threestages(x, n, q >> 2); + q >>= 3; + } + while (q >= 16) { + q >>= 1; + for (j = 0; j < n; j += 4 * q) { + for (k = j; k < j + q; k += 8) { + int32x8 x0 = int32x8_load(&x[k]); + int32x8 x1 = int32x8_load(&x[k + q]); + int32x8 x2 = int32x8_load(&x[k + 2 * q]); + int32x8 x3 = int32x8_load(&x[k + 3 * q]); + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + + int32x8_store(&x[k], x0); + int32x8_store(&x[k + q], x1); + int32x8_store(&x[k + 2 * q], x2); + int32x8_store(&x[k + 3 * q], x3); + } + } + q >>= 1; + } + if (q == 8) { + for (j = 0; j < n; j += 2 * q) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + q]); + + int32x8_MINMAX(x0, x1); + + int32x8_store(&x[j], x0); + int32x8_store(&x[j + q], x1); + } + } + + q = n >> 3; + for (k = 0; k < q; k += 8) { + int32x8 x0 = int32x8_load(&x[k]); + int32x8 x1 = int32x8_load(&x[k + q]); + int32x8 x2 = int32x8_load(&x[k + 2 * q]); + int32x8 x3 = int32x8_load(&x[k + 3 * q]); + int32x8 x4 = int32x8_load(&x[k + 4 * q]); + int32x8 x5 = int32x8_load(&x[k + 5 * q]); + int32x8 x6 = int32x8_load(&x[k + 6 * q]); + int32x8 x7 = int32x8_load(&x[k + 7 * q]); + + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + + int32x8_store(&x[k], x0); + int32x8_store(&x[k + q], x1); + int32x8_store(&x[k + 2 * q], x2); + int32x8_store(&x[k + 3 * q], x3); + int32x8_store(&x[k + 4 * q], x4); + int32x8_store(&x[k + 5 * q], x5); + int32x8_store(&x[k + 6 * q], x6); + int32x8_store(&x[k + 7 * q], x7); + } + } + + /* everything is still masked with _mm256_set_epi32(0,-1,0,-1,0,-1,0,-1); */ + mask = _mm256_set1_epi32(-1); + + for (i = 0; i < n; i += 64) { + int32x8 a0 = int32x8_load(&x[i]); + int32x8 a1 = int32x8_load(&x[i + 8]); + int32x8 a2 = int32x8_load(&x[i + 16]); + int32x8 a3 = int32x8_load(&x[i + 24]); + int32x8 a4 = int32x8_load(&x[i + 32]); + int32x8 a5 = int32x8_load(&x[i + 40]); + int32x8 a6 = int32x8_load(&x[i + 48]); + int32x8 a7 = int32x8_load(&x[i + 56]); + + int32x8 b0 = _mm256_unpacklo_epi32(a0, a1); /* AB0AB1AB4AB5 */ + int32x8 b1 = _mm256_unpackhi_epi32(a0, a1); /* AB2AB3AB6AB7 */ + int32x8 b2 = _mm256_unpacklo_epi32(a2, a3); /* CD0CD1CD4CD5 */ + int32x8 b3 = _mm256_unpackhi_epi32(a2, a3); /* CD2CD3CD6CD7 */ + int32x8 b4 = _mm256_unpacklo_epi32(a4, a5); /* EF0EF1EF4EF5 */ + int32x8 b5 = _mm256_unpackhi_epi32(a4, a5); /* EF2EF3EF6EF7 */ + int32x8 b6 = _mm256_unpacklo_epi32(a6, a7); /* GH0GH1GH4GH5 */ + int32x8 b7 = _mm256_unpackhi_epi32(a6, a7); /* GH2GH3GH6GH7 */ + + int32x8 c0 = _mm256_unpacklo_epi64(b0, b2); /* ABCD0ABCD4 */ + int32x8 c1 = _mm256_unpacklo_epi64(b1, b3); /* ABCD2ABCD6 */ + int32x8 c2 = _mm256_unpackhi_epi64(b0, b2); /* ABCD1ABCD5 */ + int32x8 c3 = _mm256_unpackhi_epi64(b1, b3); /* 
ABCD3ABCD7 */ + int32x8 c4 = _mm256_unpacklo_epi64(b4, b6); /* EFGH0EFGH4 */ + int32x8 c5 = _mm256_unpacklo_epi64(b5, b7); /* EFGH2EFGH6 */ + int32x8 c6 = _mm256_unpackhi_epi64(b4, b6); /* EFGH1EFGH5 */ + int32x8 c7 = _mm256_unpackhi_epi64(b5, b7); /* EFGH3EFGH7 */ + + if (flagdown) { + c2 ^= mask; + c3 ^= mask; + c6 ^= mask; + c7 ^= mask; + } else { + c0 ^= mask; + c1 ^= mask; + c4 ^= mask; + c5 ^= mask; + } + + int32x8 d0 = _mm256_permute2x128_si256(c0, c4, 0x20); /* ABCDEFGH0 */ + int32x8 d1 = _mm256_permute2x128_si256(c2, c6, 0x20); /* ABCDEFGH1 */ + int32x8 d2 = _mm256_permute2x128_si256(c1, c5, 0x20); /* ABCDEFGH2 */ + int32x8 d3 = _mm256_permute2x128_si256(c3, c7, 0x20); /* ABCDEFGH5 */ + int32x8 d4 = _mm256_permute2x128_si256(c0, c4, 0x31); /* ABCDEFGH4 */ + int32x8 d5 = _mm256_permute2x128_si256(c2, c6, 0x31); /* ABCDEFGH3 */ + int32x8 d6 = _mm256_permute2x128_si256(c1, c5, 0x31); /* ABCDEFGH6 */ + int32x8 d7 = _mm256_permute2x128_si256(c3, c7, 0x31); /* ABCDEFGH7 */ + + int32x8_MINMAX(d0, d1); + int32x8_MINMAX(d2, d3); + int32x8_MINMAX(d4, d5); + int32x8_MINMAX(d6, d7); + int32x8_MINMAX(d0, d2); + int32x8_MINMAX(d1, d3); + int32x8_MINMAX(d4, d6); + int32x8_MINMAX(d5, d7); + int32x8_MINMAX(d0, d4); + int32x8_MINMAX(d1, d5); + int32x8_MINMAX(d2, d6); + int32x8_MINMAX(d3, d7); + + int32x8 e0 = _mm256_unpacklo_epi32(d0, d1); + int32x8 e1 = _mm256_unpackhi_epi32(d0, d1); + int32x8 e2 = _mm256_unpacklo_epi32(d2, d3); + int32x8 e3 = _mm256_unpackhi_epi32(d2, d3); + int32x8 e4 = _mm256_unpacklo_epi32(d4, d5); + int32x8 e5 = _mm256_unpackhi_epi32(d4, d5); + int32x8 e6 = _mm256_unpacklo_epi32(d6, d7); + int32x8 e7 = _mm256_unpackhi_epi32(d6, d7); + + int32x8 f0 = _mm256_unpacklo_epi64(e0, e2); + int32x8 f1 = _mm256_unpacklo_epi64(e1, e3); + int32x8 f2 = _mm256_unpackhi_epi64(e0, e2); + int32x8 f3 = _mm256_unpackhi_epi64(e1, e3); + int32x8 f4 = _mm256_unpacklo_epi64(e4, e6); + int32x8 f5 = _mm256_unpacklo_epi64(e5, e7); + int32x8 f6 = _mm256_unpackhi_epi64(e4, e6); + int32x8 f7 = _mm256_unpackhi_epi64(e5, e7); + + int32x8 g0 = _mm256_permute2x128_si256(f0, f4, 0x20); + int32x8 g1 = _mm256_permute2x128_si256(f2, f6, 0x20); + int32x8 g2 = _mm256_permute2x128_si256(f1, f5, 0x20); + int32x8 g3 = _mm256_permute2x128_si256(f3, f7, 0x20); + int32x8 g4 = _mm256_permute2x128_si256(f0, f4, 0x31); + int32x8 g5 = _mm256_permute2x128_si256(f2, f6, 0x31); + int32x8 g6 = _mm256_permute2x128_si256(f1, f5, 0x31); + int32x8 g7 = _mm256_permute2x128_si256(f3, f7, 0x31); + + int32x8_store(&x[i], g0); + int32x8_store(&x[i + 8], g1); + int32x8_store(&x[i + 16], g2); + int32x8_store(&x[i + 24], g3); + int32x8_store(&x[i + 32], g4); + int32x8_store(&x[i + 40], g5); + int32x8_store(&x[i + 48], g6); + int32x8_store(&x[i + 56], g7); + } + + q = n >> 4; + while (q >= 128 || q == 32) { + q >>= 2; + for (j = 0; j < n; j += 8 * q) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + 
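/* three bitonic merge stages (strides 4*q, 2*q, q) applied in registers above; store the eight vectors back at stride q */ +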
int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + } + q >>= 1; + } + while (q >= 16) { + q >>= 1; + for (j = 0; j < n; j += 4 * q) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + } + } + q >>= 1; + } + if (q == 8) { + for (j = 0; j < n; j += q + q) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + q]); + int32x8_MINMAX(x0, x1); + int32x8_store(&x[j], x0); + int32x8_store(&x[j + q], x1); + } + } + + q = n >> 3; + for (i = 0; i < q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + + int32x8 b0 = _mm256_unpacklo_epi32(x0, x4); /* AE0AE1AE4AE5 */ + int32x8 b1 = _mm256_unpackhi_epi32(x0, x4); /* AE2AE3AE6AE7 */ + int32x8 b2 = _mm256_unpacklo_epi32(x1, x5); /* BF0BF1BF4BF5 */ + int32x8 b3 = _mm256_unpackhi_epi32(x1, x5); /* BF2BF3BF6BF7 */ + int32x8 b4 = _mm256_unpacklo_epi32(x2, x6); /* CG0CG1CG4CG5 */ + int32x8 b5 = _mm256_unpackhi_epi32(x2, x6); /* CG2CG3CG6CG7 */ + int32x8 b6 = _mm256_unpacklo_epi32(x3, x7); /* DH0DH1DH4DH5 */ + int32x8 b7 = _mm256_unpackhi_epi32(x3, x7); /* DH2DH3DH6DH7 */ + + int32x8 c0 = _mm256_unpacklo_epi64(b0, b4); /* AECG0AECG4 */ + int32x8 c1 = _mm256_unpacklo_epi64(b1, b5); /* AECG2AECG6 */ + int32x8 c2 = _mm256_unpackhi_epi64(b0, b4); /* AECG1AECG5 */ + int32x8 c3 = _mm256_unpackhi_epi64(b1, b5); /* AECG3AECG7 */ + int32x8 c4 = _mm256_unpacklo_epi64(b2, b6); /* BFDH0BFDH4 */ + int32x8 c5 = _mm256_unpacklo_epi64(b3, b7); /* BFDH2BFDH6 */ + int32x8 c6 = _mm256_unpackhi_epi64(b2, b6); /* BFDH1BFDH5 */ + int32x8 c7 = _mm256_unpackhi_epi64(b3, b7); /* BFDH3BFDH7 */ + + int32x8 d0 = _mm256_permute2x128_si256(c0, c4, 0x20); /* AECGBFDH0 */ + int32x8 d1 = _mm256_permute2x128_si256(c1, c5, 0x20); /* AECGBFDH2 */ + int32x8 d2 = _mm256_permute2x128_si256(c2, c6, 0x20); /* AECGBFDH1 */ + int32x8 d3 = _mm256_permute2x128_si256(c3, c7, 0x20); /* AECGBFDH3 */ + int32x8 d4 = _mm256_permute2x128_si256(c0, c4, 0x31); /* AECGBFDH4 */ + int32x8 d5 = _mm256_permute2x128_si256(c1, c5, 0x31); /* AECGBFDH6 */ + int32x8 d6 = _mm256_permute2x128_si256(c2, c6, 0x31); /* AECGBFDH5 */ + int32x8 d7 = _mm256_permute2x128_si256(c3, c7, 0x31); /* AECGBFDH7 */ + + if (flagdown) { + d0 ^= mask; + d1 ^= mask; + d2 ^= mask; + d3 ^= mask; + d4 ^= mask; + d5 ^= mask; + d6 ^= mask; + d7 ^= mask; + } + + int32x8_store(&x[i], d0); + int32x8_store(&x[i + q], d4); + 
int32x8_store(&x[i + 2 * q], d1); + int32x8_store(&x[i + 3 * q], d5); + int32x8_store(&x[i + 4 * q], d2); + int32x8_store(&x[i + 5 * q], d6); + int32x8_store(&x[i + 6 * q], d3); + int32x8_store(&x[i + 7 * q], d7); + } +} + +void PQCLEAN_MCELIECE8192128F_AVX_int32_sort(int32 *x, size_t n) { + size_t q, i, j; + + if (n <= 8) { + if (n == 8) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + int32_MINMAX(&x[4], &x[5]); + int32_MINMAX(&x[5], &x[6]); + int32_MINMAX(&x[6], &x[7]); + } + if (n >= 7) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + int32_MINMAX(&x[4], &x[5]); + int32_MINMAX(&x[5], &x[6]); + } + if (n >= 6) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + int32_MINMAX(&x[4], &x[5]); + } + if (n >= 5) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + } + if (n >= 4) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + } + if (n >= 3) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + } + if (n >= 2) { + int32_MINMAX(&x[0], &x[1]); + } + return; + } + + if (!(n & (n - 1))) { + int32_sort_2power(x, n, 0); + return; + } + + q = 8; + while (q < n - q) { + q += q; + } + /* n > q >= 8 */ + + if (q <= 128) { /* n <= 256 */ + int32x8 y[32]; + for (i = q >> 3; i < q >> 2; ++i) { + y[i] = _mm256_set1_epi32(0x7fffffff); + } + for (i = 0; i < n; ++i) { + ((int32 *)y)[i] = x[i]; + } + int32_sort_2power((int32 *) y, 2 * q, 0); + for (i = 0; i < n; ++i) { + x[i] = ((int32 *) y)[i]; + } + return; + } + + int32_sort_2power(x, q, 1); + PQCLEAN_MCELIECE8192128F_AVX_int32_sort(x + q, n - q); + + while (q >= 64) { + q >>= 2; + j = int32_threestages(x, n, q); + minmax_vector(x + j, x + j + 4 * q, n - 4 * q - j); + if (j + 4 * q <= n) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + } + j += 4 * q; + } + minmax_vector(x + j, x + j + 2 * q, n - 2 * q - j); + if (j + 2 * q <= n) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8_MINMAX(x0, x1); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + } + j += 2 * q; + } + minmax_vector(x + j, x + j + q, n - q - j); + q >>= 1; + } + if (q == 32) { + j = 0; + for (; j + 64 <= n; j += 64) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 8]); + int32x8 x2 = int32x8_load(&x[j + 16]); + int32x8 x3 = int32x8_load(&x[j + 24]); + int32x8 x4 = int32x8_load(&x[j + 32]); + int32x8 x5 = int32x8_load(&x[j + 40]); + int32x8 x6 = int32x8_load(&x[j + 48]); + int32x8 x7 = int32x8_load(&x[j + 56]); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8 a0 = _mm256_permute2x128_si256(x0, 
x1, 0x20); + int32x8 a1 = _mm256_permute2x128_si256(x0, x1, 0x31); + int32x8 a2 = _mm256_permute2x128_si256(x2, x3, 0x20); + int32x8 a3 = _mm256_permute2x128_si256(x2, x3, 0x31); + int32x8 a4 = _mm256_permute2x128_si256(x4, x5, 0x20); + int32x8 a5 = _mm256_permute2x128_si256(x4, x5, 0x31); + int32x8 a6 = _mm256_permute2x128_si256(x6, x7, 0x20); + int32x8 a7 = _mm256_permute2x128_si256(x6, x7, 0x31); + int32x8_MINMAX(a0, a1); + int32x8_MINMAX(a2, a3); + int32x8_MINMAX(a4, a5); + int32x8_MINMAX(a6, a7); + int32x8 b0 = _mm256_permute2x128_si256(a0, a1, 0x20); + int32x8 b1 = _mm256_permute2x128_si256(a0, a1, 0x31); + int32x8 b2 = _mm256_permute2x128_si256(a2, a3, 0x20); + int32x8 b3 = _mm256_permute2x128_si256(a2, a3, 0x31); + int32x8 b4 = _mm256_permute2x128_si256(a4, a5, 0x20); + int32x8 b5 = _mm256_permute2x128_si256(a4, a5, 0x31); + int32x8 b6 = _mm256_permute2x128_si256(a6, a7, 0x20); + int32x8 b7 = _mm256_permute2x128_si256(a6, a7, 0x31); + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); + int32x8 c2 = _mm256_unpacklo_epi64(b2, b3); + int32x8 c3 = _mm256_unpackhi_epi64(b2, b3); + int32x8 c4 = _mm256_unpacklo_epi64(b4, b5); + int32x8 c5 = _mm256_unpackhi_epi64(b4, b5); + int32x8 c6 = _mm256_unpacklo_epi64(b6, b7); + int32x8 c7 = _mm256_unpackhi_epi64(b6, b7); + int32x8_MINMAX(c0, c1); + int32x8_MINMAX(c2, c3); + int32x8_MINMAX(c4, c5); + int32x8_MINMAX(c6, c7); + int32x8 d0 = _mm256_unpacklo_epi32(c0, c1); + int32x8 d1 = _mm256_unpackhi_epi32(c0, c1); + int32x8 d2 = _mm256_unpacklo_epi32(c2, c3); + int32x8 d3 = _mm256_unpackhi_epi32(c2, c3); + int32x8 d4 = _mm256_unpacklo_epi32(c4, c5); + int32x8 d5 = _mm256_unpackhi_epi32(c4, c5); + int32x8 d6 = _mm256_unpacklo_epi32(c6, c7); + int32x8 d7 = _mm256_unpackhi_epi32(c6, c7); + int32x8 e0 = _mm256_unpacklo_epi64(d0, d1); + int32x8 e1 = _mm256_unpackhi_epi64(d0, d1); + int32x8 e2 = _mm256_unpacklo_epi64(d2, d3); + int32x8 e3 = _mm256_unpackhi_epi64(d2, d3); + int32x8 e4 = _mm256_unpacklo_epi64(d4, d5); + int32x8 e5 = _mm256_unpackhi_epi64(d4, d5); + int32x8 e6 = _mm256_unpacklo_epi64(d6, d7); + int32x8 e7 = _mm256_unpackhi_epi64(d6, d7); + int32x8_MINMAX(e0, e1); + int32x8_MINMAX(e2, e3); + int32x8_MINMAX(e4, e5); + int32x8_MINMAX(e6, e7); + int32x8 f0 = _mm256_unpacklo_epi32(e0, e1); + int32x8 f1 = _mm256_unpackhi_epi32(e0, e1); + int32x8 f2 = _mm256_unpacklo_epi32(e2, e3); + int32x8 f3 = _mm256_unpackhi_epi32(e2, e3); + int32x8 f4 = _mm256_unpacklo_epi32(e4, e5); + int32x8 f5 = _mm256_unpackhi_epi32(e4, e5); + int32x8 f6 = _mm256_unpacklo_epi32(e6, e7); + int32x8 f7 = _mm256_unpackhi_epi32(e6, e7); + int32x8_store(&x[j], f0); + int32x8_store(&x[j + 8], f1); + int32x8_store(&x[j + 16], f2); + int32x8_store(&x[j + 24], f3); + int32x8_store(&x[j + 32], f4); + int32x8_store(&x[j + 40], f5); + int32x8_store(&x[j + 48], f6); + int32x8_store(&x[j + 56], f7); + } + minmax_vector(x + j, x + j + 32, n - 32 - j); + goto continue16; + } + if (q == 16) { + j = 0; +continue16: + for (; j + 32 <= n; j += 32) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 8]); + int32x8 x2 = int32x8_load(&x[j + 16]); + int32x8 x3 = int32x8_load(&x[j + 24]); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8 a0 = _mm256_permute2x128_si256(x0, x1, 0x20); + int32x8 a1 = _mm256_permute2x128_si256(x0, x1, 0x31); + int32x8 a2 = _mm256_permute2x128_si256(x2, x3, 0x20); + int32x8 a3 = _mm256_permute2x128_si256(x2, x3, 0x31); + int32x8_MINMAX(a0, a1); + 
int32x8_MINMAX(a2, a3); + int32x8 b0 = _mm256_permute2x128_si256(a0, a1, 0x20); + int32x8 b1 = _mm256_permute2x128_si256(a0, a1, 0x31); + int32x8 b2 = _mm256_permute2x128_si256(a2, a3, 0x20); + int32x8 b3 = _mm256_permute2x128_si256(a2, a3, 0x31); + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); + int32x8 c2 = _mm256_unpacklo_epi64(b2, b3); + int32x8 c3 = _mm256_unpackhi_epi64(b2, b3); + int32x8_MINMAX(c0, c1); + int32x8_MINMAX(c2, c3); + int32x8 d0 = _mm256_unpacklo_epi32(c0, c1); + int32x8 d1 = _mm256_unpackhi_epi32(c0, c1); + int32x8 d2 = _mm256_unpacklo_epi32(c2, c3); + int32x8 d3 = _mm256_unpackhi_epi32(c2, c3); + int32x8 e0 = _mm256_unpacklo_epi64(d0, d1); + int32x8 e1 = _mm256_unpackhi_epi64(d0, d1); + int32x8 e2 = _mm256_unpacklo_epi64(d2, d3); + int32x8 e3 = _mm256_unpackhi_epi64(d2, d3); + int32x8_MINMAX(e0, e1); + int32x8_MINMAX(e2, e3); + int32x8 f0 = _mm256_unpacklo_epi32(e0, e1); + int32x8 f1 = _mm256_unpackhi_epi32(e0, e1); + int32x8 f2 = _mm256_unpacklo_epi32(e2, e3); + int32x8 f3 = _mm256_unpackhi_epi32(e2, e3); + int32x8_store(&x[j], f0); + int32x8_store(&x[j + 8], f1); + int32x8_store(&x[j + 16], f2); + int32x8_store(&x[j + 24], f3); + } + minmax_vector(x + j, x + j + 16, n - 16 - j); + goto continue8; + } + /* q == 8 */ + j = 0; +continue8: + for (; j + 16 <= n; j += 16) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 8]); + int32x8_MINMAX(x0, x1); + int32x8_store(&x[j], x0); + int32x8_store(&x[j + 8], x1); + int32x8 a0 = _mm256_permute2x128_si256(x0, x1, 0x20); /* x0123y0123 */ + int32x8 a1 = _mm256_permute2x128_si256(x0, x1, 0x31); /* x4567y4567 */ + int32x8_MINMAX(a0, a1); + int32x8 b0 = _mm256_permute2x128_si256(a0, a1, 0x20); /* x01234567 */ + int32x8 b1 = _mm256_permute2x128_si256(a0, a1, 0x31); /* y01234567 */ + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); /* x01y01x45y45 */ + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); /* x23y23x67y67 */ + int32x8_MINMAX(c0, c1); + int32x8 d0 = _mm256_unpacklo_epi32(c0, c1); /* x02x13x46x57 */ + int32x8 d1 = _mm256_unpackhi_epi32(c0, c1); /* y02y13y46y57 */ + int32x8 e0 = _mm256_unpacklo_epi64(d0, d1); /* x02y02x46y46 */ + int32x8 e1 = _mm256_unpackhi_epi64(d0, d1); /* x13y13x57y57 */ + int32x8_MINMAX(e0, e1); + int32x8 f0 = _mm256_unpacklo_epi32(e0, e1); /* x01234567 */ + int32x8 f1 = _mm256_unpackhi_epi32(e0, e1); /* y01234567 */ + int32x8_store(&x[j], f0); + int32x8_store(&x[j + 8], f1); + } + minmax_vector(x + j, x + j + 8, n - 8 - j); + if (j + 8 <= n) { + int32_MINMAX(&x[j], &x[j + 4]); + int32_MINMAX(&x[j + 1], &x[j + 5]); + int32_MINMAX(&x[j + 2], &x[j + 6]); + int32_MINMAX(&x[j + 3], &x[j + 7]); + int32_MINMAX(&x[j], &x[j + 2]); + int32_MINMAX(&x[j + 1], &x[j + 3]); + int32_MINMAX(&x[j], &x[j + 1]); + int32_MINMAX(&x[j + 2], &x[j + 3]); + int32_MINMAX(&x[j + 4], &x[j + 6]); + int32_MINMAX(&x[j + 5], &x[j + 7]); + int32_MINMAX(&x[j + 4], &x[j + 5]); + int32_MINMAX(&x[j + 6], &x[j + 7]); + j += 8; + } + minmax_vector(x + j, x + j + 4, n - 4 - j); + if (j + 4 <= n) { + int32_MINMAX(&x[j], &x[j + 2]); + int32_MINMAX(&x[j + 1], &x[j + 3]); + int32_MINMAX(&x[j], &x[j + 1]); + int32_MINMAX(&x[j + 2], &x[j + 3]); + j += 4; + } + if (j + 3 <= n) { + int32_MINMAX(&x[j], &x[j + 2]); + } + if (j + 2 <= n) { + int32_MINMAX(&x[j], &x[j + 1]); + } +} diff --git a/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/int32_sort.h b/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/int32_sort.h new file mode 100644 index 0000000000..10e286fc72 --- /dev/null +++ 
b/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/int32_sort.h @@ -0,0 +1,9 @@ +#ifndef PQCLEAN_MCELIECE8192128F_AVX_INT32_SORT_H +#define PQCLEAN_MCELIECE8192128F_AVX_INT32_SORT_H + +#include +#include + +void PQCLEAN_MCELIECE8192128F_AVX_int32_sort(int32_t *x, size_t n); + +#endif diff --git a/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/operations.c b/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/operations.c new file mode 100644 index 0000000000..ce461ca97c --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/operations.c @@ -0,0 +1,136 @@ +#include "api.h" + +#include "aes256ctr.h" +#include "controlbits.h" +#include "crypto_hash.h" +#include "decrypt.h" +#include "encrypt.h" +#include "params.h" +#include "pk_gen.h" +#include "randombytes.h" +#include "sk_gen.h" +#include "util.h" + +#include +#include + +int PQCLEAN_MCELIECE8192128F_AVX_crypto_kem_enc( + uint8_t *c, + uint8_t *key, + const uint8_t *pk +) { + uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; + uint8_t *e = two_e + 1; + uint8_t one_ec[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ] = {1}; + + PQCLEAN_MCELIECE8192128F_AVX_encrypt(c, e, pk); + + crypto_hash_32b(c + SYND_BYTES, two_e, sizeof(two_e)); + + memcpy(one_ec + 1, e, SYS_N / 8); + memcpy(one_ec + 1 + SYS_N / 8, c, SYND_BYTES + 32); + + crypto_hash_32b(key, one_ec, sizeof(one_ec)); + + return 0; +} + +int PQCLEAN_MCELIECE8192128F_AVX_crypto_kem_dec( + uint8_t *key, + const uint8_t *c, + const uint8_t *sk +) { + int i; + + uint8_t ret_confirm = 0; + uint8_t ret_decrypt = 0; + + uint16_t m; + + uint8_t conf[32]; + uint8_t two_e[ 1 + SYS_N / 8 ] = {2}; + uint8_t *e = two_e + 1; + uint8_t preimage[ 1 + SYS_N / 8 + (SYND_BYTES + 32) ]; + uint8_t *x = preimage; + + // + + ret_decrypt = (uint8_t)PQCLEAN_MCELIECE8192128F_AVX_decrypt(e, sk + SYS_N / 8, c); + + crypto_hash_32b(conf, two_e, sizeof(two_e)); + + for (i = 0; i < 32; i++) { + ret_confirm |= conf[i] ^ c[SYND_BYTES + i]; + } + + m = ret_decrypt | ret_confirm; + m -= 1; + m >>= 8; + + *x++ = (~m & 0) | (m & 1); + for (i = 0; i < SYS_N / 8; i++) { + *x++ = (~m & sk[i]) | (m & e[i]); + } + for (i = 0; i < SYND_BYTES + 32; i++) { + *x++ = c[i]; + } + + crypto_hash_32b(key, preimage, sizeof(preimage)); + + return 0; +} + +int PQCLEAN_MCELIECE8192128F_AVX_crypto_kem_keypair +( + uint8_t *pk, + uint8_t *sk +) { + int i; + uint8_t seed[ 32 ]; + uint8_t r[ SYS_T * 2 + (1 << GFBITS)*sizeof(uint32_t) + SYS_N / 8 + 32 ]; + uint8_t nonce[ 16 ] = {0}; + uint8_t *rp; + + gf f[ SYS_T ]; // element in GF(2^mt) + gf irr[ SYS_T ]; // Goppa polynomial + uint32_t perm[ 1 << GFBITS ]; // random permutation + + randombytes(seed, sizeof(seed)); + + while (1) { + rp = r; + PQCLEAN_MCELIECE8192128F_AVX_aes256ctr(r, sizeof(r), nonce, seed); + memcpy(seed, &r[ sizeof(r) - 32 ], 32); + + for (i = 0; i < SYS_T; i++) { + f[i] = PQCLEAN_MCELIECE8192128F_AVX_load2(rp + i * 2); + } + rp += sizeof(f); + if (PQCLEAN_MCELIECE8192128F_AVX_genpoly_gen(irr, f)) { + continue; + } + + for (i = 0; i < (1 << GFBITS); i++) { + perm[i] = PQCLEAN_MCELIECE8192128F_AVX_load4(rp + i * 4); + } + rp += sizeof(perm); + if (PQCLEAN_MCELIECE8192128F_AVX_perm_check(perm)) { + continue; + } + + for (i = 0; i < SYS_T; i++) { + PQCLEAN_MCELIECE8192128F_AVX_store2(sk + SYS_N / 8 + i * 2, irr[i]); + } + if (PQCLEAN_MCELIECE8192128F_AVX_pk_gen(pk, perm, sk + SYS_N / 8)) { + continue; + } + + memcpy(sk, rp, SYS_N / 8); + PQCLEAN_MCELIECE8192128F_AVX_controlbits(sk + SYS_N / 8 + IRR_BYTES, perm); + + break; + } + + return 0; +} + diff --git 
a/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/params.h b/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/params.h new file mode 100644 index 0000000000..7e7766d6da --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/params.h @@ -0,0 +1,21 @@ +#ifndef PQCLEAN_MCELIECE8192128F_AVX_PARAMS_H +#define PQCLEAN_MCELIECE8192128F_AVX_PARAMS_H + +#define GFBITS 13 +#define SYS_N 8192 +#define SYS_T 128 + +#define COND_BYTES ((1 << (GFBITS-4))*(2*GFBITS - 1)) +#define IRR_BYTES (SYS_T * 2) + +#define PK_NROWS (SYS_T*GFBITS) +#define PK_NCOLS (SYS_N - PK_NROWS) +#define PK_ROW_BYTES ((PK_NCOLS + 7)/8) + +#define SK_BYTES (SYS_N/8 + IRR_BYTES + COND_BYTES) +#define SYND_BYTES ((PK_NROWS + 7)/8) + +#define GFMASK ((1 << GFBITS) - 1) + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/pk_gen.c b/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/pk_gen.c new file mode 100644 index 0000000000..9d222aba72 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/pk_gen.c @@ -0,0 +1,355 @@ +/* + This file is for public-key generation +*/ + +#include "pk_gen.h" + +#include "benes.h" +#include "controlbits.h" +#include "fft.h" +#include "params.h" +#include "util.h" + +#include + +#define min(a, b) (((a) < (b)) ? (a) : (b)) + +static void de_bitslicing(uint64_t *out, vec256 in[][GFBITS]) { + int i, j, r; + uint64_t u = 0; + + for (i = 0; i < (1 << GFBITS); i++) { + out[i] = 0 ; + } + + for (i = 0; i < 32; i++) { + for (j = GFBITS - 1; j >= 0; j--) { + u = PQCLEAN_MCELIECE8192128F_AVX_vec256_extract(in[i][j], 0); + for (r = 0; r < 64; r++) { + out[i * 256 + 0 * 64 + r] <<= 1; + out[i * 256 + 0 * 64 + r] |= (u >> r) & 1; + } + u = PQCLEAN_MCELIECE8192128F_AVX_vec256_extract(in[i][j], 1); + for (r = 0; r < 64; r++) { + out[i * 256 + 1 * 64 + r] <<= 1; + out[i * 256 + 1 * 64 + r] |= (u >> r) & 1; + } + u = PQCLEAN_MCELIECE8192128F_AVX_vec256_extract(in[i][j], 2); + for (r = 0; r < 64; r++) { + out[i * 256 + 2 * 64 + r] <<= 1; + out[i * 256 + 2 * 64 + r] |= (u >> r) & 1; + } + u = PQCLEAN_MCELIECE8192128F_AVX_vec256_extract(in[i][j], 3); + for (r = 0; r < 64; r++) { + out[i * 256 + 3 * 64 + r] <<= 1; + out[i * 256 + 3 * 64 + r] |= (u >> r) & 1; + } + } + } +} + +static void to_bitslicing_2x(vec256 out0[][GFBITS], vec256 out1[][GFBITS], const uint64_t *in) { + int i, j, k, r; + uint64_t u[4] = {0}; + + for (i = 0; i < 32; i++) { + for (j = GFBITS - 1; j >= 0; j--) { + for (k = 0; k < 4; k++) { + for (r = 63; r >= 0; r--) { + u[k] <<= 1; + u[k] |= (in[i * 256 + k * 64 + r] >> (j + GFBITS)) & 1; + } + } + + out1[i][j] = PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(u[0], u[1], u[2], u[3]); + } + + for (j = GFBITS - 1; j >= 0; j--) { + for (k = 0; k < 4; k++) { + for (r = 63; r >= 0; r--) { + u[k] <<= 1; + u[k] |= (in[i * 256 + k * 64 + r] >> j) & 1; + } + } + + out0[i][GFBITS - 1 - j] = PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(u[0], u[1], u[2], u[3]); + } + } +} + +static void transpose_64x64(uint64_t *out, const uint64_t *in) { + int i, j, s, d; + + uint64_t x, y; + uint64_t masks[6][2] = { + {0x5555555555555555, 0xAAAAAAAAAAAAAAAA}, + {0x3333333333333333, 0xCCCCCCCCCCCCCCCC}, + {0x0F0F0F0F0F0F0F0F, 0xF0F0F0F0F0F0F0F0}, + {0x00FF00FF00FF00FF, 0xFF00FF00FF00FF00}, + {0x0000FFFF0000FFFF, 0xFFFF0000FFFF0000}, + {0x00000000FFFFFFFF, 0xFFFFFFFF00000000} + }; + + for (i = 0; i < 64; i++) { + out[i] = in[i]; + } + + for (d = 5; d >= 0; d--) { + s = 1 << d; + + for (i = 0; i < 64; i += s * 2) { + for (j = i; j < i + s; j++) { + x = (out[j] & 
masks[d][0]) | ((out[j + s] & masks[d][0]) << s); + y = ((out[j] & masks[d][1]) >> s) | (out[j + s] & masks[d][1]); + + out[j + 0] = x; + out[j + s] = y; + } + } + } +} + +/* return number of trailing zeros of the non-zero input in */ +static inline int ctz(uint64_t in) { + return (int)_tzcnt_u64(in); +} + +static inline uint64_t same_mask(uint16_t x, uint16_t y) { + uint64_t mask; + + mask = x ^ y; + mask -= 1; + mask >>= 63; + mask = -mask; + + return mask; +} + +static int mov_columns(uint64_t mat[][ 128 ], uint32_t *perm) { + int i, j, k, s, block_idx, row; + uint64_t buf[64], ctz_list[32], t, d, mask; + + row = GFBITS * SYS_T - 32; + block_idx = row / 64; + + // extract the 32x64 matrix + + for (i = 0; i < 32; i++) { + buf[i] = (mat[ row + i ][ block_idx + 0 ] >> 32) | + (mat[ row + i ][ block_idx + 1 ] << 32); + } + + // compute the column indices of pivots by Gaussian elimination. + // the indices are stored in ctz_list + + for (i = 0; i < 32; i++) { + t = buf[i]; + for (j = i + 1; j < 32; j++) { + t |= buf[j]; + } + + if (t == 0) { + return -1; // return if buf is not full rank + } + + ctz_list[i] = s = ctz(t); + + for (j = i + 1; j < 32; j++) { + mask = (buf[i] >> s) & 1; + mask -= 1; + buf[i] ^= buf[j] & mask; + } + for (j = 0; j < i; j++) { + mask = (buf[j] >> s) & 1; + mask = -mask; + buf[j] ^= buf[i] & mask; + } + for (j = i + 1; j < 32; j++) { + mask = (buf[j] >> s) & 1; + mask = -mask; + buf[j] ^= buf[i] & mask; + } + } + + // updating permutation + + for (j = 0; j < 32; j++) { + for (k = j + 1; k < 64; k++) { + d = perm[ row + j ] ^ perm[ row + k ]; + d &= same_mask(k, ctz_list[j]); + perm[ row + j ] ^= d; + perm[ row + k ] ^= d; + } + } + + // moving columns of mat according to the column indices of pivots + + for (i = 0; i < GFBITS * SYS_T; i += 64) { + + for (j = 0; j < min(64, GFBITS * SYS_T - i); j++) { + buf[j] = (mat[ i + j ][ block_idx + 0 ] >> 32) | + (mat[ i + j ][ block_idx + 1 ] << 32); + } + + transpose_64x64(buf, buf); + + for (j = 0; j < 32; j++) { + for (k = j + 1; k < 64; k++) { + d = buf[ j ] ^ buf[ k ]; + d &= same_mask(k, ctz_list[j]); + buf[ j ] ^= d; + buf[ k ] ^= d; + } + } + + transpose_64x64(buf, buf); + + for (j = 0; j < min(64, GFBITS * SYS_T - i); j++) { + mat[ i + j ][ block_idx + 0 ] = (mat[ i + j ][ block_idx + 0 ] & 0x00000000FFFFFFFF) | (buf[j] << 32); + mat[ i + j ][ block_idx + 1 ] = (mat[ i + j ][ block_idx + 1 ] & 0xFFFFFFFF00000000) | (buf[j] >> 32); + } + } + + return 0; +} + + +#define NBLOCKS2_H ((SYS_N + 255) / 256) +#define NBLOCKS1_I ((GFBITS * SYS_T + 63) / 64) +int PQCLEAN_MCELIECE8192128F_AVX_pk_gen(unsigned char *pk, uint32_t *perm, const unsigned char *sk) { + int i, j, k; + int row, c; + + uint64_t mat[ GFBITS * SYS_T ][ 128 ]; + + uint64_t mask; + + vec128 sk_int[ GFBITS ]; + + vec256 consts[ 32 ][ GFBITS ]; + vec256 eval[ 32 ][ GFBITS ]; + vec256 prod[ 32 ][ GFBITS ]; + vec256 tmp[ GFBITS ]; + + uint64_t list[1 << GFBITS]; + + // compute the inverses + + PQCLEAN_MCELIECE8192128F_AVX_irr_load(sk_int, sk); + + PQCLEAN_MCELIECE8192128F_AVX_fft(eval, sk_int); + + PQCLEAN_MCELIECE8192128F_AVX_vec256_copy(prod[0], eval[0]); + + for (i = 1; i < 32; i++) { + PQCLEAN_MCELIECE8192128F_AVX_vec256_mul(prod[i], prod[i - 1], eval[i]); + } + + PQCLEAN_MCELIECE8192128F_AVX_vec256_inv(tmp, prod[31]); + + for (i = 30; i >= 0; i--) { + PQCLEAN_MCELIECE8192128F_AVX_vec256_mul(prod[i + 1], prod[i], tmp); + PQCLEAN_MCELIECE8192128F_AVX_vec256_mul(tmp, tmp, eval[i + 1]); + } + + PQCLEAN_MCELIECE8192128F_AVX_vec256_copy(prod[0], tmp); + + // fill 
matrix + + de_bitslicing(list, prod); + + for (i = 0; i < (1 << GFBITS); i++) { + list[i] <<= GFBITS; + list[i] |= i; + list[i] |= ((uint64_t) perm[i]) << 31; + } + + PQCLEAN_MCELIECE8192128F_AVX_sort_63b(1 << GFBITS, list); + + to_bitslicing_2x(consts, prod, list); + + for (i = 0; i < (1 << GFBITS); i++) { + perm[i] = list[i] & GFMASK; + } + + for (j = 0; j < NBLOCKS2_H; j++) { + for (k = 0; k < GFBITS; k++) { + mat[ k ][ 4 * j + 0 ] = PQCLEAN_MCELIECE8192128F_AVX_vec256_extract(prod[ j ][ k ], 0); + mat[ k ][ 4 * j + 1 ] = PQCLEAN_MCELIECE8192128F_AVX_vec256_extract(prod[ j ][ k ], 1); + mat[ k ][ 4 * j + 2 ] = PQCLEAN_MCELIECE8192128F_AVX_vec256_extract(prod[ j ][ k ], 2); + mat[ k ][ 4 * j + 3 ] = PQCLEAN_MCELIECE8192128F_AVX_vec256_extract(prod[ j ][ k ], 3); + } + } + + for (i = 1; i < SYS_T; i++) { + for (j = 0; j < NBLOCKS2_H; j++) { + PQCLEAN_MCELIECE8192128F_AVX_vec256_mul(prod[j], prod[j], consts[j]); + + for (k = 0; k < GFBITS; k++) { + mat[ i * GFBITS + k ][ 4 * j + 0 ] = PQCLEAN_MCELIECE8192128F_AVX_vec256_extract(prod[ j ][ k ], 0); + mat[ i * GFBITS + k ][ 4 * j + 1 ] = PQCLEAN_MCELIECE8192128F_AVX_vec256_extract(prod[ j ][ k ], 1); + mat[ i * GFBITS + k ][ 4 * j + 2 ] = PQCLEAN_MCELIECE8192128F_AVX_vec256_extract(prod[ j ][ k ], 2); + mat[ i * GFBITS + k ][ 4 * j + 3 ] = PQCLEAN_MCELIECE8192128F_AVX_vec256_extract(prod[ j ][ k ], 3); + } + } + } + + // gaussian elimination + + for (i = 0; i < (GFBITS * SYS_T) / 64; i++) { + for (j = 0; j < 64; j++) { + row = i * 64 + j; + + if (row == GFBITS * SYS_T - 32) { + if (mov_columns(mat, perm)) { + return -1; + } + } + + for (k = row + 1; k < PK_NROWS; k++) { + mask = mat[ row ][ i ] >> j; + mask &= 1; + mask -= 1; + + for (c = 0; c < 128; c++) { + mat[ row ][ c ] ^= mat[ k ][ c ] & mask; + } + } + + if ( ((mat[ row ][ i ] >> j) & 1) == 0 ) { // return if not systematic + return -1; + } + + for (k = 0; k < row; k++) { + mask = mat[ k ][ i ] >> j; + mask &= 1; + mask = -mask; + + for (c = 0; c < 128; c++) { + mat[ k ][ c ] ^= mat[ row ][ c ] & mask; + } + } + for (k = row + 1; k < GFBITS * SYS_T; k++) { + mask = mat[ k ][ i ] >> j; + mask &= 1; + mask = -mask; + + for (c = 0; c < 128; c++) { + mat[ k ][ c ] ^= mat[ row ][ c ] & mask; + } + } + } + } + + for (i = 0; i < GFBITS * SYS_T; i++) { + for (j = NBLOCKS1_I; j < 128; j++) { + PQCLEAN_MCELIECE8192128F_AVX_store8(pk, mat[i][j]); + pk += 8; + } + } + + // + + return 0; +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/pk_gen.h b/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/pk_gen.h new file mode 100644 index 0000000000..5620a7615f --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/pk_gen.h @@ -0,0 +1,12 @@ +#ifndef PQCLEAN_MCELIECE8192128F_AVX_PK_GEN_H +#define PQCLEAN_MCELIECE8192128F_AVX_PK_GEN_H +/* + This file is for public-key generation +*/ + +#include + +int PQCLEAN_MCELIECE8192128F_AVX_pk_gen(unsigned char * /*pk*/, uint32_t * /*perm*/, const unsigned char * /*sk*/); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/powers.inc b/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/powers.inc new file mode 100644 index 0000000000..33a3a60088 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/powers.inc @@ -0,0 +1,480 @@ +{ + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 
0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + 
PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + 
PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + 
PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + 
PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + 
PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + 
PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + 
PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + 
PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + 
PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + 
PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + 
PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + 
PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0, 0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F) +}, +{ + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + 
PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3, 0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5, 0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A, 0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000, 0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + 
PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33, 0XCC33CC33CC33CC33), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X9696969669696969, 0X9696969669696969, 0X9696969669696969, 0X9696969669696969), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5, 0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +}, +{ + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XA55AA55A5AA55AA5, 0XA55AA55A5AA55AA5, 0X5AA55AA5A55AA55A, 0X5AA55AA5A55AA55A), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XC33CC33C3CC33CC3, 0X3CC33CC3C33CC33C, 0X3CC33CC3C33CC33C, 0XC33CC33C3CC33CC3), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A, 0XA5A55A5AA5A55A5A), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0X0000FFFF0000FFFF, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC, 0X33CC33CC33CC33CC), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F, 0XF00FF00F0FF00FF0, 0X0FF00FF0F00FF00F), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X6969696996969696, 0X6969696996969696, 0X6969696996969696, 0X6969696996969696), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X5A5A5A5A5A5A5A5A, 0X5A5A5A5A5A5A5A5A, 0XA5A5A5A5A5A5A5A5, 0XA5A5A5A5A5A5A5A5), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XA5A5A5A55A5A5A5A, 0XA5A5A5A55A5A5A5A, 0X5A5A5A5AA5A5A5A5, 0X5A5A5A5AA5A5A5A5), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X5555AAAAAAAA5555, 0XAAAA55555555AAAA, 0X5555AAAAAAAA5555, 0XAAAA55555555AAAA), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0, 0XF00FF00FF00FF00F, 0X0FF00FF00FF00FF0), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0F0FF0F00F0FF0F0, 0XF0F00F0FF0F00F0F, 0XF0F00F0FF0F00F0F, 0X0F0FF0F00F0FF0F0) +} diff --git a/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/scalars_2x.inc b/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/scalars_2x.inc new file mode 100644 index 0000000000..5d664f2c5c --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/scalars_2x.inc @@ -0,0 +1,75 @@ +{ + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0X3C3CF30C0000C003, 0X0C0F0FCF0F0CF330), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0X0CCCC3F333C0000C, 0XF0000FC33C3CCF3C), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0X03C33F33FCC0C03C, 0X3C0F3F00C3C300FC), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0X0003000F3C03C0C0, 0X3C33CCC0F0F3CC30), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0XF33FF33030CF03F0, 0XC0CFFFFFCCCC30CC), + 
PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0X0CF0303300F0CCC0, 0X3FC3F3CCFFFC033F), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0XFF3F0C0CC0FF3CC0, 0XFC3030CCCCC0CFCF), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0XCF3CF0FF003FC000, 0X0FCF0C00CCF333C3), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0XC00FF3CF0303F300, 0XCFFCF33000CFF030), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0X3CCC0CC00CF0CC00, 0X00CFFCC330F30FCC), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0XF30FFC3C3FCCFC00, 0X3CCC3FCCC0F3FFF3), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0X3F0FC3F0CCF0C000, 0XF00F0C3FC003C0FF), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0X3000FF33CCF0F000, 0X330CCFCC03C0FC33) +}, +{ + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0X0F0F0FF0F000000F, 0XF0FFFFFFF0F00F00), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0X00FFFFFFFF0000F0, 0X00FFF0FFFF0000FF), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0XFFFF00FF00000F00, 0X00FF00000F0F0FFF), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0XFFF000F00F0FF000, 0XF000F0000F00FF0F), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0XFFF0000F0FF000F0, 0XFF000000FFF00000), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0X00FF000FFF000000, 0XF0FF000FF00F0FF0), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0XFF0F0FFF0F0FF000, 0X0F0F0F00FF000F0F), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0X0FFF0000000F0000, 0X0F0F00F0F0F0F000), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0X00F000F0FFF00F00, 0X00F00F00F00F000F), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0X00F00FF00F00F000, 0X00F0F0F00000FFF0), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0XFFF000F000F00000, 0XFFFFFF0FF00F0FFF), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0X00F00F000FF00000, 0X0F0FFFF00FFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0X0000FF0F0000F000, 0XFFFF0F0FFF0FFF00) +}, +{ + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0X00FF0000000000FF, 0X00FF00FF00FF0000), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0XFFFFFFFFFF00FF00, 0XFF00FFFF000000FF), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0XFF0000FF00FF0000, 0X0000FFFF000000FF), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0XFFFF000000FF0000, 0X00FFFF00FF000000), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0XFF00000000FF0000, 0XFFFFFF0000FF00FF), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0X00FFFFFFFF000000, 0X0000FFFF00FFFF00), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0XFF0000FFFFFF0000, 0XFF00FF0000FFFF00), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0XFF00FF00FFFF0000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0X00FFFFFFFF00FF00, 0X0000FF0000000000), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0XFFFF000000000000, 0XFF00FFFF00FFFF00), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0X00FF0000FF000000, 0X00FFFF00000000FF), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0XFF00FF00FF000000, 0X0000FF00FF00FFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0X00FF00FFFF000000, 0XFF0000FFFFFF0000) +}, +{ + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0X000000000000FFFF, 0X0000FFFF00000000), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0XFFFFFFFFFFFF0000, 0XFFFFFFFF0000FFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0X0000000000000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0XFFFF0000FFFF0000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0XFFFFFFFFFFFF0000, 0X0000FFFF00000000), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0X0000FFFF00000000, 0XFFFF0000FFFF0000), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000), + 
PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0X0000FFFF00000000, 0XFFFFFFFF0000FFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0XFFFF000000000000, 0X00000000FFFF0000), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0XFFFF000000000000, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0XFFFF000000000000, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000) +}, +{ + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0X00000000FFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0XFFFFFFFF00000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0X0000000000000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000) +} diff --git a/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/scalars_4x.inc b/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/scalars_4x.inc new file mode 100644 index 0000000000..75bbc87cf2 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/scalars_4x.inc @@ -0,0 +1,91 @@ +{ + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X3C3CF30C0000C003, 0X0C0F0FCF0F0CF330, 0XF0F30C33CF03F03F, 0X3F30CC0C000F3FCC), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0CCCC3F333C0000C, 0XF0000FC33C3CCF3C, 0X00F30FC00C3300FF, 0XFC3CF030FC3FFF03), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X03C33F33FCC0C03C, 0X3C0F3F00C3C300FC, 0XF3CC3CF3F3FCF33F, 0X33FFFCFF0CCF3CC3), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0003000F3C03C0C0, 0X3C33CCC0F0F3CC30, 0X3C0FC0FC303C3F3C, 0X003CFF33C3CC30CF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XF33FF33030CF03F0, 0XC0CFFFFFCCCC30CC, 0XFC30CF303F3FF00F, 0XCFF3CF33C00F3003), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0CF0303300F0CCC0, 0X3FC3F3CCFFFC033F, 0X33300C0CC3300CF3, 0X00F3CC0CF3003CCF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFF3F0C0CC0FF3CC0, 0XFC3030CCCCC0CFCF, 0X3C030CF3F03FF3F3, 0X3C000CFCCC3C3333), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XCF3CF0FF003FC000, 0X0FCF0C00CCF333C3, 0X3CCC03FCCC3FFC03, 0XF3CF03C0FCF03FF0), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XC00FF3CF0303F300, 0XCFFCF33000CFF030, 0X033C3C3CF0003FC3, 0X3F3C3CF0C330330C), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X3CCC0CC00CF0CC00, 0X00CFFCC330F30FCC, 0XFFC0FF00F0FF0F03, 0X33CCFCC0FF0033F0), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XF30FFC3C3FCCFC00, 0X3CCC3FCCC0F3FFF3, 0XF3F30CF003FCC303, 0X33C300C0F0C003F3), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X3F0FC3F0CCF0C000, 0XF00F0C3FC003C0FF, 0X30CFCFC3CC0F3000, 0X003FF0003F00C00C), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X3000FF33CCF0F000, 0X330CCFCC03C0FC33, 0X0CF30CCF3FCFCC0F, 
0XCFF3C3033F030FFF) +}, +{ + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0F0F0FF0F000000F, 0XF0FFFFFFF0F00F00, 0X0F0F00FF0FF0FFFF, 0XFF0F0F00F000F0FF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X00FFFFFFFF0000F0, 0X00FFF0FFFF0000FF, 0XF000F0F00F00FF0F, 0X0FFFFFFFFF00000F), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFF00FF00000F00, 0X00FF00000F0F0FFF, 0X000FFFF0FFF0FF0F, 0XF0FFFF000F00F0FF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFF000F00F0FF000, 0XF000F0000F00FF0F, 0X00F00FFF00000FF0, 0X0F0000F00FFF0FFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFF0000F0FF000F0, 0XFF000000FFF00000, 0XFFFFF0000FFFF00F, 0X0F0F0F00FF0F000F), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X00FF000FFF000000, 0XF0FF000FF00F0FF0, 0XFFF0FFF0000FFFF0, 0X000F0F0FFFF0F000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFF0F0FFF0F0FF000, 0X0F0F0F00FF000F0F, 0XF0F0F0000F0F0F00, 0XF0FFFF0F00F0FF0F), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0FFF0000000F0000, 0X0F0F00F0F0F0F000, 0X00F000F0F00FFF00, 0X0F0F000F0F00F0FF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X00F000F0FFF00F00, 0X00F00F00F00F000F, 0XF0FF0F0FFF00F0FF, 0X0000F0FF00FF0F0F), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X00F00FF00F00F000, 0X00F0F0F00000FFF0, 0XF0FF0FFFF0F0F0FF, 0X00FFFF0FF0FFF0F0), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFF000F000F00000, 0XFFFFFF0FF00F0FFF, 0X00FFFFFFFFFFFFF0, 0X0000000F00F0FFF0), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X00F00F000FF00000, 0X0F0FFFF00FFFFFFF, 0X00FFF0F0FF000F0F, 0XF0F00000FF00F0F0), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000FF0F0000F000, 0XFFFF0F0FFF0FFF00, 0X000FFFF0000FFF00, 0X0F0F0FFFFFFFFFFF) +}, +{ + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X00FF0000000000FF, 0X00FF00FF00FF0000, 0XFFFF00FF00FF00FF, 0XFF0000FFFFFF00FF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFFFFFFFF00FF00, 0XFF00FFFF000000FF, 0X00FFFF000000FF00, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFF0000FF00FF0000, 0X0000FFFF000000FF, 0XFFFF00FFFFFFFF00, 0XFFFF000000FFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFF000000FF0000, 0X00FFFF00FF000000, 0X0000FFFF00FFFFFF, 0X00FFFF00FF0000FF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFF00000000FF0000, 0XFFFFFF0000FF00FF, 0X00FF0000FF0000FF, 0XFFFFFF00FFFFFF00), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X00FFFFFFFF000000, 0X0000FFFF00FFFF00, 0XFFFF0000FF00FFFF, 0X00FFFF00FFFF00FF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFF0000FFFFFF0000, 0XFF00FF0000FFFF00, 0XFF000000FFFFFF00, 0X0000FFFF00FF0000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFF00FF00FFFF0000, 0X00000000FFFFFFFF, 0X000000000000FFFF, 0X000000FFFF000000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X00FFFFFFFF00FF00, 0X0000FF0000000000, 0XFF00FF00FFFF0000, 0XFF00FF0000FF00FF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFF000000000000, 0XFF00FFFF00FFFF00, 0XFFFF00FFFF00FFFF, 0X00FF0000000000FF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X00FF0000FF000000, 0X00FFFF00000000FF, 0XFFFFFFFFFF00FF00, 0XFF00FFFF00FF00FF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFF00FF00FF000000, 0X0000FF00FF00FFFF, 0XFFFF00FFFF0000FF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X00FF00FFFF000000, 0XFF0000FFFFFF0000, 0X0000FF00000000FF, 0X0000FF000000FFFF) +}, +{ + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X000000000000FFFF, 0X0000FFFF00000000, 0XFFFFFFFFFFFFFFFF, 0X0000FFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFFFFFFFFFF0000, 0XFFFFFFFF0000FFFF, 0XFFFFFFFF00000000, 0X0000FFFF0000FFFF), + 
PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0X00000000FFFFFFFF, 0XFFFF000000000000, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000000000000000, 0X0000FFFF00000000, 0XFFFF0000FFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFFFFFFFFFF0000, 0X0000FFFF00000000, 0X00000000FFFF0000, 0X00000000FFFF0000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000FFFF00000000, 0XFFFF0000FFFF0000, 0X0000FFFFFFFFFFFF, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000FFFFFFFF0000, 0X0000FFFFFFFF0000, 0X0000FFFFFFFFFFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFF0000FFFF0000, 0X0000FFFF0000FFFF, 0XFFFFFFFF00000000, 0XFFFF00000000FFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000FFFF00000000, 0XFFFFFFFF0000FFFF, 0X000000000000FFFF, 0X0000FFFF0000FFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFF000000000000, 0X00000000FFFF0000, 0X000000000000FFFF, 0X0000FFFF00000000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFF000000000000, 0XFFFF0000FFFFFFFF, 0XFFFFFFFFFFFF0000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFF000000000000, 0XFFFF0000FFFFFFFF, 0XFFFFFFFF0000FFFF, 0X0000FFFFFFFF0000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0X0000000000000000, 0XFFFF0000FFFFFFFF, 0X0000FFFFFFFFFFFF) +}, +{ + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X00000000FFFFFFFF, 0X0000000000000000, 0X00000000FFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFF00000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0X0000000000000000, 0XFFFFFFFF00000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0X00000000FFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0X00000000FFFFFFFF, 0XFFFFFFFF00000000, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFF00000000, 0X00000000FFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFF00000000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFFFFFF00000000, 0XFFFFFFFF00000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFF00000000) +}, +{ + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0X0000000000000000, 0X0000000000000000), + 
PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF, 0X0000000000000000), + PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(0X0000000000000000, 0X0000000000000000, 0XFFFFFFFFFFFFFFFF, 0XFFFFFFFFFFFFFFFF) +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/sk_gen.c b/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/sk_gen.c new file mode 100644 index 0000000000..05f3bc1437 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/sk_gen.c @@ -0,0 +1,98 @@ +/* + This file is for secret-key generation +*/ + +#include "sk_gen.h" + +#include "controlbits.h" +#include "gf.h" +#include "params.h" +#include "util.h" + +/* input: f, element in GF((2^m)^t) */ +/* output: out, minimal polynomial of f */ +/* return: 0 for success and -1 for failure */ +int PQCLEAN_MCELIECE8192128F_AVX_genpoly_gen(gf *out, gf *f) { + int i, j, k, c; + + gf mat[ SYS_T + 1 ][ SYS_T ]; + gf mask, inv, t; + + // fill matrix + + mat[0][0] = 1; + + for (i = 1; i < SYS_T; i++) { + mat[0][i] = 0; + } + + for (i = 0; i < SYS_T; i++) { + mat[1][i] = f[i]; + } + + for (j = 2; j <= SYS_T; j++) { + PQCLEAN_MCELIECE8192128F_AVX_GF_mul(mat[j], mat[j - 1], f); + } + + // gaussian + + for (j = 0; j < SYS_T; j++) { + for (k = j + 1; k < SYS_T; k++) { + mask = PQCLEAN_MCELIECE8192128F_AVX_gf_iszero(mat[ j ][ j ]); + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ j ] ^= mat[ c ][ k ] & mask; + } + + } + + if ( mat[ j ][ j ] == 0 ) { // return if not systematic + return -1; + } + + inv = PQCLEAN_MCELIECE8192128F_AVX_gf_inv(mat[j][j]); + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ j ] = PQCLEAN_MCELIECE8192128F_AVX_gf_mul(mat[ c ][ j ], inv) ; + } + + for (k = 0; k < SYS_T; k++) { + if (k != j) { + t = mat[ j ][ k ]; + + for (c = j; c < SYS_T + 1; c++) { + mat[ c ][ k ] ^= PQCLEAN_MCELIECE8192128F_AVX_gf_mul(mat[ c ][ j ], t); + } + } + } + } + + for (i = 0; i < SYS_T; i++) { + out[i] = mat[ SYS_T ][ i ]; + } + + return 0; +} + +/* input: permutation p represented as a list of 32-bit intergers */ +/* output: -1 if some interger repeats in p */ +/* 0 otherwise */ +int PQCLEAN_MCELIECE8192128F_AVX_perm_check(const uint32_t *p) { + int i; + uint64_t list[1 << GFBITS]; + + for (i = 0; i < (1 << GFBITS); i++) { + list[i] = p[i]; + } + + PQCLEAN_MCELIECE8192128F_AVX_sort_63b(1 << GFBITS, list); + + for (i = 1; i < (1 << GFBITS); i++) { + if (list[i - 1] == list[i]) { + return -1; + } + } + + return 0; +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/sk_gen.h b/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/sk_gen.h new file mode 100644 index 0000000000..5db0e8612e --- 
/dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/sk_gen.h @@ -0,0 +1,16 @@ +#ifndef PQCLEAN_MCELIECE8192128F_AVX_SK_GEN_H +#define PQCLEAN_MCELIECE8192128F_AVX_SK_GEN_H +/* + This file is for secret-key generation +*/ + + +#include "gf.h" + +#include + +int PQCLEAN_MCELIECE8192128F_AVX_genpoly_gen(gf * /*out*/, gf * /*f*/); +int PQCLEAN_MCELIECE8192128F_AVX_perm_check(const uint32_t * /*p*/); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/syndrome_asm.S b/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/syndrome_asm.S new file mode 100644 index 0000000000..a7f2d65335 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/syndrome_asm.S @@ -0,0 +1,910 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: int64 b64 + +# qhasm: int64 synd + +# qhasm: int64 addr + +# qhasm: int64 c + +# qhasm: int64 c_all + +# qhasm: int64 row + +# qhasm: int64 p + +# qhasm: int64 e + +# qhasm: int64 s + +# qhasm: reg256 pp + +# qhasm: reg256 ee + +# qhasm: reg256 ss + +# qhasm: int64 buf_ptr + +# qhasm: stack256 buf + +# qhasm: enter syndrome_asm +.p2align 5 +.global _PQCLEAN_MCELIECE8192128F_AVX_syndrome_asm +.global PQCLEAN_MCELIECE8192128F_AVX_syndrome_asm +_PQCLEAN_MCELIECE8192128F_AVX_syndrome_asm: +PQCLEAN_MCELIECE8192128F_AVX_syndrome_asm: +mov %rsp,%r11 +and $31,%r11 +add $32,%r11 +sub %r11,%rsp + +# qhasm: input_1 += 1357008 +# asm 1: add $1357008,buf_ptr=int64#4 +# asm 2: leaq buf_ptr=%rcx +leaq 0(%rsp),%rcx + +# qhasm: row = 1664 +# asm 1: mov $1664,>row=int64#5 +# asm 2: mov $1664,>row=%r8 +mov $1664,%r8 + +# qhasm: loop: +._loop: + +# qhasm: row -= 1 +# asm 1: sub $1,ss=reg256#1 +# asm 2: vmovupd 0(ss=%ymm0 +vmovupd 0(%rsi),%ymm0 + +# qhasm: ee = mem256[ input_2 + 208 ] +# asm 1: vmovupd 208(ee=reg256#2 +# asm 2: vmovupd 208(ee=%ymm1 +vmovupd 208(%rdx),%ymm1 + +# qhasm: ss &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 32(pp=%ymm1 +vmovupd 32(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 240 ] +# asm 1: vmovupd 240(ee=reg256#3 +# asm 2: vmovupd 240(ee=%ymm2 +vmovupd 240(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 64(pp=%ymm1 +vmovupd 64(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 272 ] +# asm 1: vmovupd 272(ee=reg256#3 +# asm 2: vmovupd 272(ee=%ymm2 +vmovupd 272(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 96(pp=%ymm1 +vmovupd 96(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 304 ] +# asm 1: vmovupd 304(ee=reg256#3 +# asm 2: vmovupd 304(ee=%ymm2 +vmovupd 304(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 128(pp=%ymm1 +vmovupd 128(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 336 ] +# asm 1: vmovupd 336(ee=reg256#3 +# asm 2: vmovupd 336(ee=%ymm2 +vmovupd 336(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 160(pp=%ymm1 +vmovupd 160(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 368 ] +# asm 1: vmovupd 368(ee=reg256#3 +# asm 2: vmovupd 368(ee=%ymm2 +vmovupd 368(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 192(pp=%ymm1 +vmovupd 192(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 400 ] 
+# asm 1: vmovupd 400(ee=reg256#3 +# asm 2: vmovupd 400(ee=%ymm2 +vmovupd 400(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 224(pp=%ymm1 +vmovupd 224(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 432 ] +# asm 1: vmovupd 432(ee=reg256#3 +# asm 2: vmovupd 432(ee=%ymm2 +vmovupd 432(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 256(pp=%ymm1 +vmovupd 256(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 464 ] +# asm 1: vmovupd 464(ee=reg256#3 +# asm 2: vmovupd 464(ee=%ymm2 +vmovupd 464(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 288(pp=%ymm1 +vmovupd 288(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 496 ] +# asm 1: vmovupd 496(ee=reg256#3 +# asm 2: vmovupd 496(ee=%ymm2 +vmovupd 496(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 320(pp=%ymm1 +vmovupd 320(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 528 ] +# asm 1: vmovupd 528(ee=reg256#3 +# asm 2: vmovupd 528(ee=%ymm2 +vmovupd 528(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 352(pp=%ymm1 +vmovupd 352(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 560 ] +# asm 1: vmovupd 560(ee=reg256#3 +# asm 2: vmovupd 560(ee=%ymm2 +vmovupd 560(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 384(pp=%ymm1 +vmovupd 384(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 592 ] +# asm 1: vmovupd 592(ee=reg256#3 +# asm 2: vmovupd 592(ee=%ymm2 +vmovupd 592(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 416(pp=%ymm1 +vmovupd 416(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 624 ] +# asm 1: vmovupd 624(ee=reg256#3 +# asm 2: vmovupd 624(ee=%ymm2 +vmovupd 624(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 448(pp=%ymm1 +vmovupd 448(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 656 ] +# asm 1: vmovupd 656(ee=reg256#3 +# asm 2: vmovupd 656(ee=%ymm2 +vmovupd 656(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 480(pp=%ymm1 +vmovupd 480(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 688 ] +# asm 1: vmovupd 688(ee=reg256#3 +# asm 2: vmovupd 688(ee=%ymm2 +vmovupd 688(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 512(pp=%ymm1 +vmovupd 512(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 720 ] +# asm 1: vmovupd 720(ee=reg256#3 +# asm 2: vmovupd 720(ee=%ymm2 +vmovupd 720(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 544(pp=%ymm1 +vmovupd 544(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 752 ] +# asm 1: vmovupd 752(ee=reg256#3 +# asm 2: vmovupd 752(ee=%ymm2 +vmovupd 752(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 576(pp=%ymm1 +vmovupd 576(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 784 ] +# asm 1: vmovupd 784(ee=reg256#3 +# asm 2: vmovupd 784(ee=%ymm2 +vmovupd 784(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 608(pp=%ymm1 +vmovupd 608(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 816 ] +# asm 1: vmovupd 816(ee=reg256#3 +# asm 2: vmovupd 816(ee=%ymm2 +vmovupd 816(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 640(pp=%ymm1 +vmovupd 640(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 848 ] +# asm 1: vmovupd 848(ee=reg256#3 +# asm 2: vmovupd 848(ee=%ymm2 +vmovupd 848(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 672(pp=%ymm1 +vmovupd 672(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 880 ] +# asm 1: 
vmovupd 880(ee=reg256#3 +# asm 2: vmovupd 880(ee=%ymm2 +vmovupd 880(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 704(pp=%ymm1 +vmovupd 704(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 912 ] +# asm 1: vmovupd 912(ee=reg256#3 +# asm 2: vmovupd 912(ee=%ymm2 +vmovupd 912(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 736(pp=%ymm1 +vmovupd 736(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 944 ] +# asm 1: vmovupd 944(ee=reg256#3 +# asm 2: vmovupd 944(ee=%ymm2 +vmovupd 944(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand pp=reg256#2 +# asm 2: vmovupd 768(pp=%ymm1 +vmovupd 768(%rsi),%ymm1 + +# qhasm: ee = mem256[ input_2 + 976 ] +# asm 1: vmovupd 976(ee=reg256#3 +# asm 2: vmovupd 976(ee=%ymm2 +vmovupd 976(%rdx),%ymm2 + +# qhasm: pp &= ee +# asm 1: vpand buf=stack256#1 +# asm 2: vmovapd buf=0(%rsp) +vmovapd %ymm0,0(%rsp) + +# qhasm: s = mem64[input_1 + 800] +# asm 1: movq 800(s=int64#6 +# asm 2: movq 800(s=%r9 +movq 800(%rsi),%r9 + +# qhasm: e = mem64[input_2 + 1008] +# asm 1: movq 1008(e=int64#7 +# asm 2: movq 1008(e=%rax +movq 1008(%rdx),%rax + +# qhasm: s &= e +# asm 1: and p=int64#7 +# asm 2: movq 808(p=%rax +movq 808(%rsi),%rax + +# qhasm: e = mem64[input_2 + 1016] +# asm 1: movq 1016(e=int64#8 +# asm 2: movq 1016(e=%r10 +movq 1016(%rdx),%r10 + +# qhasm: p &= e +# asm 1: and c_all=int64#6 +# asm 2: popcnt c_all=%r9 +popcnt %r9, %r9 + +# qhasm: b64 = mem64[ buf_ptr + 0 ] +# asm 1: movq 0(b64=int64#7 +# asm 2: movq 0(b64=%rax +movq 0(%rcx),%rax + +# qhasm: c = count(b64) +# asm 1: popcnt c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# qhasm: c_all ^= c +# asm 1: xor b64=int64#7 +# asm 2: movq 8(b64=%rax +movq 8(%rcx),%rax + +# qhasm: c = count(b64) +# asm 1: popcnt c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# qhasm: c_all ^= c +# asm 1: xor b64=int64#7 +# asm 2: movq 16(b64=%rax +movq 16(%rcx),%rax + +# qhasm: c = count(b64) +# asm 1: popcnt c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# qhasm: c_all ^= c +# asm 1: xor b64=int64#7 +# asm 2: movq 24(b64=%rax +movq 24(%rcx),%rax + +# qhasm: c = count(b64) +# asm 1: popcnt c=int64#7 +# asm 2: popcnt c=%rax +popcnt %rax, %rax + +# qhasm: c_all ^= c +# asm 1: xor addr=int64#7 +# asm 2: mov addr=%rax +mov %r8,%rax + +# qhasm: (uint64) addr >>= 3 +# asm 1: shr $3,synd=int64#8 +# asm 2: movzbq 0(synd=%r10 +movzbq 0(%rax),%r10 + +# qhasm: synd <<= 1 +# asm 1: shl $1,ss=reg256#1 +# asm 2: vmovupd 0(ss=%ymm0 +vmovupd 0(%rdi),%ymm0 + +# qhasm: ee = mem256[ input_2 + 0 ] +# asm 1: vmovupd 0(ee=reg256#2 +# asm 2: vmovupd 0(ee=%ymm1 +vmovupd 0(%rdx),%ymm1 + +# qhasm: ss ^= ee +# asm 1: vpxor ss=reg256#1 +# asm 2: vmovupd 32(ss=%ymm0 +vmovupd 32(%rdi),%ymm0 + +# qhasm: ee = mem256[ input_2 + 32 ] +# asm 1: vmovupd 32(ee=reg256#2 +# asm 2: vmovupd 32(ee=%ymm1 +vmovupd 32(%rdx),%ymm1 + +# qhasm: ss ^= ee +# asm 1: vpxor ss=reg256#1 +# asm 2: vmovupd 64(ss=%ymm0 +vmovupd 64(%rdi),%ymm0 + +# qhasm: ee = mem256[ input_2 + 64 ] +# asm 1: vmovupd 64(ee=reg256#2 +# asm 2: vmovupd 64(ee=%ymm1 +vmovupd 64(%rdx),%ymm1 + +# qhasm: ss ^= ee +# asm 1: vpxor ss=reg256#1 +# asm 2: vmovupd 96(ss=%ymm0 +vmovupd 96(%rdi),%ymm0 + +# qhasm: ee = mem256[ input_2 + 96 ] +# asm 1: vmovupd 96(ee=reg256#2 +# asm 2: vmovupd 96(ee=%ymm1 +vmovupd 96(%rdx),%ymm1 + +# qhasm: ss ^= ee +# asm 1: vpxor ss=reg256#1 +# asm 2: vmovupd 128(ss=%ymm0 +vmovupd 128(%rdi),%ymm0 + +# qhasm: ee = mem256[ input_2 + 128 ] +# asm 1: vmovupd 128(ee=reg256#2 +# asm 2: vmovupd 128(ee=%ymm1 +vmovupd 
128(%rdx),%ymm1 + +# qhasm: ss ^= ee +# asm 1: vpxor ss=reg256#1 +# asm 2: vmovupd 160(ss=%ymm0 +vmovupd 160(%rdi),%ymm0 + +# qhasm: ee = mem256[ input_2 + 160 ] +# asm 1: vmovupd 160(ee=reg256#2 +# asm 2: vmovupd 160(ee=%ymm1 +vmovupd 160(%rdx),%ymm1 + +# qhasm: ss ^= ee +# asm 1: vpxor s=int64#2 +# asm 2: movq 192(s=%rsi +movq 192(%rdi),%rsi + +# qhasm: e = mem64[ input_2 + 192 ] +# asm 1: movq 192(e=int64#4 +# asm 2: movq 192(e=%rcx +movq 192(%rdx),%rcx + +# qhasm: s ^= e +# asm 1: xor s=int64#2 +# asm 2: movq 200(s=%rsi +movq 200(%rdi),%rsi + +# qhasm: e = mem64[ input_2 + 200 ] +# asm 1: movq 200(e=int64#3 +# asm 2: movq 200(e=%rdx +movq 200(%rdx),%rdx + +# qhasm: s ^= e +# asm 1: xor mask0=reg128#1 +# asm 2: movdqa PQCLEAN_MCELIECE8192128F_AVX_MASK5_0(%rip),>mask0=%xmm0 +movdqa PQCLEAN_MCELIECE8192128F_AVX_MASK5_0(%rip),%xmm0 + +# qhasm: mask1 aligned= mem128[ PQCLEAN_MCELIECE8192128F_AVX_MASK5_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE8192128F_AVX_MASK5_1(%rip),>mask1=reg128#2 +# asm 2: movdqa PQCLEAN_MCELIECE8192128F_AVX_MASK5_1(%rip),>mask1=%xmm1 +movdqa PQCLEAN_MCELIECE8192128F_AVX_MASK5_1(%rip),%xmm1 + +# qhasm: mask2 aligned= mem128[ PQCLEAN_MCELIECE8192128F_AVX_MASK4_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE8192128F_AVX_MASK4_0(%rip),>mask2=reg128#3 +# asm 2: movdqa PQCLEAN_MCELIECE8192128F_AVX_MASK4_0(%rip),>mask2=%xmm2 +movdqa PQCLEAN_MCELIECE8192128F_AVX_MASK4_0(%rip),%xmm2 + +# qhasm: mask3 aligned= mem128[ PQCLEAN_MCELIECE8192128F_AVX_MASK4_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE8192128F_AVX_MASK4_1(%rip),>mask3=reg128#4 +# asm 2: movdqa PQCLEAN_MCELIECE8192128F_AVX_MASK4_1(%rip),>mask3=%xmm3 +movdqa PQCLEAN_MCELIECE8192128F_AVX_MASK4_1(%rip),%xmm3 + +# qhasm: mask4 aligned= mem128[ PQCLEAN_MCELIECE8192128F_AVX_MASK3_0 ] +# asm 1: movdqa PQCLEAN_MCELIECE8192128F_AVX_MASK3_0(%rip),>mask4=reg128#5 +# asm 2: movdqa PQCLEAN_MCELIECE8192128F_AVX_MASK3_0(%rip),>mask4=%xmm4 +movdqa PQCLEAN_MCELIECE8192128F_AVX_MASK3_0(%rip),%xmm4 + +# qhasm: mask5 aligned= mem128[ PQCLEAN_MCELIECE8192128F_AVX_MASK3_1 ] +# asm 1: movdqa PQCLEAN_MCELIECE8192128F_AVX_MASK3_1(%rip),>mask5=reg128#6 +# asm 2: movdqa PQCLEAN_MCELIECE8192128F_AVX_MASK3_1(%rip),>mask5=%xmm5 +movdqa PQCLEAN_MCELIECE8192128F_AVX_MASK3_1(%rip),%xmm5 + +# qhasm: x0 = mem128[ input_0 + 0 ] +# asm 1: movdqu 0(x0=reg128#7 +# asm 2: movdqu 0(x0=%xmm6 +movdqu 0(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 128 ] +# asm 1: movdqu 128(x1=reg128#8 +# asm 2: movdqu 128(x1=%xmm7 +movdqu 128(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 256 ] +# asm 1: movdqu 256(x2=reg128#9 +# asm 2: movdqu 256(x2=%xmm8 +movdqu 256(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 384 ] +# asm 1: movdqu 384(x3=reg128#10 +# asm 2: movdqu 384(x3=%xmm9 +movdqu 384(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 512 ] +# asm 1: movdqu 512(x4=reg128#11 +# asm 2: movdqu 512(x4=%xmm10 +movdqu 512(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 640 ] +# asm 1: movdqu 640(x5=reg128#12 +# asm 2: movdqu 640(x5=%xmm11 +movdqu 640(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 768 ] +# asm 1: movdqu 768(x6=reg128#13 +# asm 2: movdqu 768(x6=%xmm12 +movdqu 768(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 896 ] +# asm 1: movdqu 896(x7=reg128#14 +# asm 2: movdqu 896(x7=%xmm13 +movdqu 896(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 
1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# 
asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor 
x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 0 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 16(x0=%xmm6 +movdqu 16(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 144 ] +# asm 1: movdqu 144(x1=reg128#8 +# asm 2: movdqu 144(x1=%xmm7 +movdqu 144(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 272 ] +# asm 1: movdqu 272(x2=reg128#9 +# asm 2: movdqu 272(x2=%xmm8 +movdqu 272(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 400 ] +# asm 1: movdqu 400(x3=reg128#10 +# asm 2: movdqu 400(x3=%xmm9 +movdqu 400(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 528 ] +# asm 1: movdqu 528(x4=reg128#11 +# asm 2: movdqu 528(x4=%xmm10 +movdqu 528(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 656 ] +# asm 1: movdqu 656(x5=reg128#12 +# asm 2: movdqu 656(x5=%xmm11 +movdqu 656(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 784 ] +# asm 1: movdqu 784(x6=reg128#13 +# asm 2: movdqu 784(x6=%xmm12 +movdqu 784(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 912 ] +# asm 1: movdqu 912(x7=reg128#14 +# asm 2: movdqu 912(x7=%xmm13 +movdqu 912(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 
+vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand 
%xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 16 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 32(x0=%xmm6 +movdqu 32(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 160 ] +# asm 1: movdqu 160(x1=reg128#8 +# asm 2: movdqu 160(x1=%xmm7 +movdqu 160(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 288 ] +# asm 1: movdqu 288(x2=reg128#9 +# asm 2: movdqu 288(x2=%xmm8 +movdqu 288(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 416 ] +# asm 1: movdqu 416(x3=reg128#10 +# asm 2: movdqu 416(x3=%xmm9 +movdqu 416(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 544 ] +# asm 1: movdqu 544(x4=reg128#11 +# asm 2: movdqu 544(x4=%xmm10 
+movdqu 544(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 672 ] +# asm 1: movdqu 672(x5=reg128#12 +# asm 2: movdqu 672(x5=%xmm11 +movdqu 672(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 800 ] +# asm 1: movdqu 800(x6=reg128#13 +# asm 2: movdqu 800(x6=%xmm12 +movdqu 800(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 928 ] +# asm 1: movdqu 928(x7=reg128#14 +# asm 2: movdqu 928(x7=%xmm13 +movdqu 928(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & 
mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor 
x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 32 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 48(x0=%xmm6 +movdqu 48(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 176 ] +# asm 1: movdqu 176(x1=reg128#8 +# asm 2: movdqu 176(x1=%xmm7 +movdqu 176(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 304 ] +# asm 1: movdqu 304(x2=reg128#9 +# asm 2: movdqu 304(x2=%xmm8 +movdqu 304(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 432 ] +# asm 1: movdqu 432(x3=reg128#10 +# asm 2: movdqu 432(x3=%xmm9 +movdqu 432(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 560 ] +# asm 1: movdqu 560(x4=reg128#11 +# asm 2: movdqu 560(x4=%xmm10 +movdqu 560(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 688 ] +# asm 1: movdqu 688(x5=reg128#12 +# asm 2: movdqu 688(x5=%xmm11 +movdqu 688(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 816 ] +# asm 1: movdqu 816(x6=reg128#13 +# asm 2: movdqu 816(x6=%xmm12 +movdqu 816(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 944 ] +# asm 1: movdqu 944(x7=reg128#14 +# asm 2: movdqu 944(x7=%xmm13 +movdqu 944(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: 
vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 
2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 
= v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 48 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 64(x0=%xmm6 +movdqu 64(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 192 ] +# asm 1: movdqu 192(x1=reg128#8 +# asm 2: movdqu 192(x1=%xmm7 +movdqu 192(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 320 ] +# asm 1: movdqu 320(x2=reg128#9 +# asm 2: movdqu 320(x2=%xmm8 +movdqu 320(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 448 ] +# asm 1: movdqu 448(x3=reg128#10 +# asm 2: movdqu 448(x3=%xmm9 +movdqu 448(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 576 ] +# asm 1: movdqu 576(x4=reg128#11 +# asm 2: movdqu 576(x4=%xmm10 +movdqu 576(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 704 ] +# asm 1: movdqu 704(x5=reg128#12 +# asm 2: movdqu 704(x5=%xmm11 +movdqu 704(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 832 ] +# asm 1: movdqu 832(x6=reg128#13 +# asm 2: movdqu 832(x6=%xmm12 +movdqu 832(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 960 ] +# asm 1: movdqu 960(x7=reg128#14 +# asm 2: movdqu 960(x7=%xmm13 +movdqu 960(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: vpsrlq $32,v01=%xmm6 +vpsrlq $32,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: 2x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm11,%xmm15 + +# qhasm: 2x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#8 +# asm 2: vpsrlq $32,v01=%xmm7 +vpsrlq $32,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: 2x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm12,%xmm15 + +# qhasm: 2x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#9 +# asm 2: vpsrlq $32,v01=%xmm8 +vpsrlq $32,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: 2x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm13,%xmm15 + +# qhasm: 2x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#10 +# asm 2: vpsrlq $32,v01=%xmm9 +vpsrlq $32,%xmm9,%xmm9 + +# qhasm: v11 = 
x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: 4x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm11,%xmm15 + +# qhasm: 4x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#15 +# asm 2: vpsrld $16,v01=%xmm14 +vpsrld $16,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: 4x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm12,%xmm15 + +# qhasm: 4x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#11 +# asm 2: vpsrld $16,v01=%xmm10 +vpsrld $16,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: 4x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm8,%xmm15 + +# qhasm: 4x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#7 +# asm 2: vpsrld $16,v01=%xmm6 +vpsrld $16,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: 4x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg128#16 +# asm 2: vpslld $16,v10=%xmm15 +vpslld $16,%xmm9,%xmm15 + +# qhasm: 4x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg128#8 +# asm 2: vpsrld $16,v01=%xmm7 +vpsrld $16,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: 8x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm14,%xmm15 + +# qhasm: 8x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#14 +# asm 2: vpsrlw $8,v01=%xmm13 +vpsrlw $8,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: x0 = v00 | v10 +# asm 1: 
vpor x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: 8x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm10,%xmm15 + +# qhasm: 8x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#12 +# asm 2: vpsrlw $8,v01=%xmm11 +vpsrlw $8,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: 8x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm8,%xmm15 + +# qhasm: 8x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#13 +# asm 2: vpsrlw $8,v01=%xmm12 +vpsrlw $8,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: 8x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg128#16 +# asm 2: vpsllw $8,v10=%xmm15 +vpsllw $8,%xmm7,%xmm15 + +# qhasm: 8x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg128#7 +# asm 2: vpsrlw $8,v01=%xmm6 +vpsrlw $8,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 64 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 80(x0=%xmm6 +movdqu 80(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 208 ] +# asm 1: movdqu 208(x1=reg128#8 +# asm 2: movdqu 208(x1=%xmm7 +movdqu 208(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 336 ] +# asm 1: movdqu 336(x2=reg128#9 +# asm 2: movdqu 336(x2=%xmm8 +movdqu 336(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 464 ] +# asm 1: movdqu 464(x3=reg128#10 +# asm 2: movdqu 464(x3=%xmm9 +movdqu 464(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 592 ] +# asm 1: movdqu 592(x4=reg128#11 +# asm 2: movdqu 592(x4=%xmm10 +movdqu 592(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 720 ] +# asm 1: movdqu 720(x5=reg128#12 +# asm 2: movdqu 720(x5=%xmm11 +movdqu 720(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 848 ] +# asm 1: movdqu 848(x6=reg128#13 +# asm 2: movdqu 848(x6=%xmm12 +movdqu 848(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 976 ] +# asm 1: movdqu 976(x7=reg128#14 +# asm 2: movdqu 976(x7=%xmm13 +movdqu 976(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: 2x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg128#16 +# asm 2: vpsllq $32,v10=%xmm15 +vpsllq $32,%xmm10,%xmm15 + +# qhasm: 2x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg128#7 +# asm 2: 
vpsrlq $32,v01=%xmm6
+vpsrlq $32,%xmm6,%xmm6

[qhasm-generated bit-matrix transpose for PQCLEAN_MCELIECE8192128F_AVX, copied
verbatim from upstream. The hunk repeats one butterfly step per register pair
(x0/x4, x1/x5, x2/x6, x3/x7, then x0/x2, x1/x3, ... down to adjacent pairs):
AND the pair with the even/odd mask constants, shift the two cross terms by
s bits (vpsllq/vpsrlq for s = 32, vpslld/vpsrld for s = 16, vpsllw/vpsrlw for
s = 8), and recombine them with vpor, writing the results back with movdqu.
The masks are then reloaded from the aligned constants
PQCLEAN_MCELIECE8192128F_AVX_MASK2_0/_1, _MASK1_0/_1 and _MASK0_0/_1 via
movdqa, and the same butterfly is applied with s = 4, 2 and 1 (psllq/psrlq),
working through successive 128-bit blocks loaded from input_0 (%rdi) at
increasing offsets. Each step is annotated with its qhasm source line
("# qhasm: v00 = x0 & mask0", "# qhasm: 2x v10 = x4 << 32", ...) followed by
the emitted SSE instruction.]
psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: 
vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 512 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 640(x0=%xmm6 +movdqu 640(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 656 ] +# asm 1: movdqu 656(x1=reg128#8 +# asm 2: movdqu 656(x1=%xmm7 +movdqu 656(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 672 ] +# asm 1: movdqu 672(x2=reg128#9 +# asm 2: movdqu 672(x2=%xmm8 +movdqu 672(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 688 ] +# asm 1: movdqu 688(x3=reg128#10 +# asm 2: movdqu 688(x3=%xmm9 +movdqu 688(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 704 ] +# asm 1: movdqu 704(x4=reg128#11 +# asm 2: movdqu 704(x4=%xmm10 +movdqu 704(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 720 ] +# asm 1: movdqu 720(x5=reg128#12 +# asm 2: movdqu 720(x5=%xmm11 +movdqu 720(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 736 ] +# asm 1: movdqu 736(x6=reg128#13 +# asm 2: movdqu 736(x6=%xmm12 +movdqu 736(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 752 ] +# asm 1: movdqu 752(x7=reg128#14 +# asm 2: movdqu 752(x7=%xmm13 +movdqu 752(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor 
%xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand 
v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand 
v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 640 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 768(x0=%xmm6 +movdqu 768(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 784 ] +# asm 1: movdqu 784(x1=reg128#8 +# asm 2: movdqu 784(x1=%xmm7 +movdqu 784(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 800 ] +# asm 1: movdqu 800(x2=reg128#9 +# asm 2: movdqu 800(x2=%xmm8 +movdqu 800(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 816 ] +# asm 1: movdqu 816(x3=reg128#10 +# asm 2: movdqu 816(x3=%xmm9 +movdqu 816(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 832 ] +# asm 1: movdqu 832(x4=reg128#11 +# asm 2: movdqu 832(x4=%xmm10 +movdqu 832(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 848 ] +# asm 1: movdqu 848(x5=reg128#12 +# asm 2: movdqu 848(x5=%xmm11 +movdqu 848(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 864 ] +# asm 1: movdqu 864(x6=reg128#13 +# asm 2: movdqu 864(x6=%xmm12 +movdqu 864(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 880 ] +# asm 1: movdqu 880(x7=reg128#14 +# asm 2: movdqu 880(x7=%xmm13 +movdqu 880(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand 
v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm13,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#14 +# asm 2: vpand v11=%xmm13 +vpand %xmm1,%xmm13,%xmm13 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#13 +# asm 2: vpor x3=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#10 +# asm 2: vpor x7=%xmm9 +vpor %xmm13,%xmm9,%xmm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#14 +# asm 2: vpand v00=%xmm13 +vpand %xmm2,%xmm14,%xmm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#15 +# asm 2: vpand v01=%xmm14 +vpand %xmm3,%xmm14,%xmm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#14 +# asm 2: vpor x0=%xmm13 +vpor %xmm15,%xmm13,%xmm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm14,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm2,%xmm10,%xmm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm3,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#15 +# asm 2: vpor x1=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm12,%xmm10,%xmm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm6,%xmm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#13 +# asm 2: vpor x4=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm2,%xmm9,%xmm15 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm3,%xmm9,%xmm9 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm15,%xmm8,%xmm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#8 +# asm 2: vpor x7=%xmm7 +vpor %xmm9,%xmm7,%xmm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm13,%xmm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm14,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm5,%xmm13,%xmm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: 
vpand v11=reg128#15 +# asm 2: vpand v11=%xmm14 +vpand %xmm5,%xmm14,%xmm14 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm15,%xmm9,%xmm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#14 +# asm 2: vpor x1=%xmm13 +vpor %xmm14,%xmm13,%xmm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm4,%xmm11,%xmm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#15 +# asm 2: vpor x2=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#11 +# asm 2: vpor x3=%xmm10 +vpor %xmm10,%xmm11,%xmm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm4,%xmm12,%xmm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm8,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#13 +# asm 2: vpand v01=%xmm12 +vpand %xmm5,%xmm12,%xmm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm5,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#12 +# asm 2: vpor x4=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#9 +# asm 2: vpor x5=%xmm8 +vpor %xmm8,%xmm12,%xmm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm4,%xmm6,%xmm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm4,%xmm7,%xmm15 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#8 +# asm 2: vpand v11=%xmm7 +vpand %xmm5,%xmm7,%xmm7 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#13 +# asm 2: vpor x6=%xmm12 +vpor %xmm15,%xmm12,%xmm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#7 +# asm 2: vpor x7=%xmm6 +vpor %xmm7,%xmm6,%xmm6 + +# qhasm: mem128[ input_0 + 768 ] = x0 +# asm 1: movdqu x0=reg128#7 +# asm 2: movdqu 896(x0=%xmm6 +movdqu 896(%rdi),%xmm6 + +# qhasm: x1 = mem128[ input_0 + 912 ] +# asm 1: movdqu 912(x1=reg128#8 +# asm 2: movdqu 912(x1=%xmm7 +movdqu 912(%rdi),%xmm7 + +# qhasm: x2 = mem128[ input_0 + 928 ] +# asm 1: movdqu 928(x2=reg128#9 +# asm 2: movdqu 928(x2=%xmm8 +movdqu 928(%rdi),%xmm8 + +# qhasm: x3 = mem128[ input_0 + 944 ] +# asm 1: movdqu 944(x3=reg128#10 +# asm 2: movdqu 944(x3=%xmm9 +movdqu 944(%rdi),%xmm9 + +# qhasm: x4 = mem128[ input_0 + 960 ] +# asm 1: movdqu 960(x4=reg128#11 +# asm 2: movdqu 960(x4=%xmm10 +movdqu 960(%rdi),%xmm10 + +# qhasm: x5 = mem128[ input_0 + 976 ] +# asm 1: movdqu 976(x5=reg128#12 +# asm 2: movdqu 976(x5=%xmm11 +movdqu 976(%rdi),%xmm11 + +# qhasm: x6 = mem128[ input_0 + 992 ] +# asm 1: movdqu 992(x6=reg128#13 +# asm 2: movdqu 992(x6=%xmm12 +movdqu 992(%rdi),%xmm12 + +# qhasm: x7 = mem128[ input_0 + 1008 ] +# asm 1: movdqu 1008(x7=reg128#14 +# asm 2: movdqu 1008(x7=%xmm13 +movdqu 1008(%rdi),%xmm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg128#15 +# asm 2: vpand v00=%xmm14 +vpand %xmm0,%xmm6,%xmm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand 
%xmm0,%xmm10,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm1,%xmm6,%xmm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg128#11 +# asm 2: vpand v11=%xmm10 +vpand %xmm1,%xmm10,%xmm10 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x0=reg128#15 +# asm 2: vpor x0=%xmm14 +vpor %xmm15,%xmm14,%xmm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg128#7 +# asm 2: vpor x4=%xmm6 +vpor %xmm10,%xmm6,%xmm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm0,%xmm7,%xmm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm11,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm1,%xmm7,%xmm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm1,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x1=reg128#11 +# asm 2: vpor x1=%xmm10 +vpor %xmm15,%xmm10,%xmm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#8 +# asm 2: vpor x5=%xmm7 +vpor %xmm11,%xmm7,%xmm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg128#12 +# asm 2: vpand v00=%xmm11 +vpand %xmm0,%xmm8,%xmm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg128#16 +# asm 2: vpand v10=%xmm15 +vpand %xmm0,%xmm12,%xmm15 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm1,%xmm8,%xmm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg128#13 +# asm 2: vpand v11=%xmm12 +vpand %xmm1,%xmm12,%xmm12 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm15,%xmm11,%xmm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#9 +# asm 2: vpor x6=%xmm8 +vpor %xmm12,%xmm8,%xmm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm0,%xmm9,%xmm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg128#1 +# asm 2: vpand v10=%xmm0 +vpand %xmm0,%xmm13,%xmm0 + +# qhasm: 2x v10 <<= 4 +# asm 1: psllq $4,v01=reg128#10 +# asm 2: vpand v01=%xmm9 +vpand %xmm1,%xmm9,%xmm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm1,%xmm13,%xmm1 + +# qhasm: 2x v01 unsigned>>= 4 +# asm 1: psrlq $4,x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm12,%xmm0 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm9,%xmm1 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm2,%xmm14,%xmm9 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg128#13 +# asm 2: vpand v10=%xmm12 +vpand %xmm2,%xmm11,%xmm12 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#14 +# asm 2: vpand v01=%xmm13 +vpand %xmm3,%xmm14,%xmm13 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg128#12 +# asm 2: vpand v11=%xmm11 +vpand %xmm3,%xmm11,%xmm11 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x0=reg128#10 +# asm 2: vpor x0=%xmm9 +vpor %xmm12,%xmm9,%xmm9 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg128#12 +# asm 2: vpor x2=%xmm11 +vpor %xmm11,%xmm13,%xmm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg128#13 +# asm 2: vpand v00=%xmm12 +vpand %xmm2,%xmm10,%xmm12 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg128#14 +# asm 2: vpand v10=%xmm13 +vpand %xmm2,%xmm0,%xmm13 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm3,%xmm10,%xmm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand 
%xmm3,%xmm0,%xmm0 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x1=reg128#13 +# asm 2: vpor x1=%xmm12 +vpor %xmm13,%xmm12,%xmm12 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm10,%xmm0 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm2,%xmm6,%xmm10 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg128#14 +# asm 2: vpand v10=%xmm13 +vpand %xmm2,%xmm8,%xmm13 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm3,%xmm6,%xmm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg128#9 +# asm 2: vpand v11=%xmm8 +vpand %xmm3,%xmm8,%xmm8 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x4=reg128#11 +# asm 2: vpor x4=%xmm10 +vpor %xmm13,%xmm10,%xmm10 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg128#7 +# asm 2: vpor x6=%xmm6 +vpor %xmm8,%xmm6,%xmm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm2,%xmm7,%xmm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg128#3 +# asm 2: vpand v10=%xmm2 +vpand %xmm2,%xmm1,%xmm2 + +# qhasm: 2x v10 <<= 2 +# asm 1: psllq $2,v01=reg128#8 +# asm 2: vpand v01=%xmm7 +vpand %xmm3,%xmm7,%xmm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm3,%xmm1,%xmm1 + +# qhasm: 2x v01 unsigned>>= 2 +# asm 1: psrlq $2,x5=reg128#3 +# asm 2: vpor x5=%xmm2 +vpor %xmm2,%xmm8,%xmm2 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm7,%xmm1 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg128#4 +# asm 2: vpand v00=%xmm3 +vpand %xmm4,%xmm9,%xmm3 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg128#8 +# asm 2: vpand v10=%xmm7 +vpand %xmm4,%xmm12,%xmm7 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#9 +# asm 2: vpand v01=%xmm8 +vpand %xmm5,%xmm9,%xmm8 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg128#10 +# asm 2: vpand v11=%xmm9 +vpand %xmm5,%xmm12,%xmm9 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x0=reg128#4 +# asm 2: vpor x0=%xmm3 +vpor %xmm7,%xmm3,%xmm3 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg128#8 +# asm 2: vpor x1=%xmm7 +vpor %xmm9,%xmm8,%xmm7 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg128#9 +# asm 2: vpand v00=%xmm8 +vpand %xmm4,%xmm11,%xmm8 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg128#10 +# asm 2: vpand v10=%xmm9 +vpand %xmm4,%xmm0,%xmm9 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#12 +# asm 2: vpand v01=%xmm11 +vpand %xmm5,%xmm11,%xmm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg128#1 +# asm 2: vpand v11=%xmm0 +vpand %xmm5,%xmm0,%xmm0 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x2=reg128#9 +# asm 2: vpor x2=%xmm8 +vpor %xmm9,%xmm8,%xmm8 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg128#1 +# asm 2: vpor x3=%xmm0 +vpor %xmm0,%xmm11,%xmm0 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg128#10 +# asm 2: vpand v00=%xmm9 +vpand %xmm4,%xmm10,%xmm9 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg128#12 +# asm 2: vpand v10=%xmm11 +vpand %xmm4,%xmm2,%xmm11 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#11 +# asm 2: vpand v01=%xmm10 +vpand %xmm5,%xmm10,%xmm10 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg128#3 +# asm 2: vpand v11=%xmm2 +vpand %xmm5,%xmm2,%xmm2 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x4=reg128#10 +# asm 2: vpor x4=%xmm9 +vpor %xmm11,%xmm9,%xmm9 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg128#3 +# asm 2: vpor x5=%xmm2 +vpor %xmm2,%xmm10,%xmm2 + +# qhasm: v00 = x6 & mask4 +# asm 1: 
vpand v00=reg128#11 +# asm 2: vpand v00=%xmm10 +vpand %xmm4,%xmm6,%xmm10 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg128#5 +# asm 2: vpand v10=%xmm4 +vpand %xmm4,%xmm1,%xmm4 + +# qhasm: 2x v10 <<= 1 +# asm 1: psllq $1,v01=reg128#7 +# asm 2: vpand v01=%xmm6 +vpand %xmm5,%xmm6,%xmm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg128#2 +# asm 2: vpand v11=%xmm1 +vpand %xmm5,%xmm1,%xmm1 + +# qhasm: 2x v01 unsigned>>= 1 +# asm 1: psrlq $1,x6=reg128#5 +# asm 2: vpor x6=%xmm4 +vpor %xmm4,%xmm10,%xmm4 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg128#2 +# asm 2: vpor x7=%xmm1 +vpor %xmm1,%xmm6,%xmm1 + +# qhasm: mem128[ input_0 + 896 ] = x0 +# asm 1: movdqu mask0=reg256#1 +# asm 2: vmovapd PQCLEAN_MCELIECE8192128F_AVX_MASK5_0,>mask0=%ymm0 +vmovapd PQCLEAN_MCELIECE8192128F_AVX_MASK5_0(%rip),%ymm0 + +# qhasm: mask1 aligned= mem256[ PQCLEAN_MCELIECE8192128F_AVX_MASK5_1 ] +# asm 1: vmovapd PQCLEAN_MCELIECE8192128F_AVX_MASK5_1,>mask1=reg256#2 +# asm 2: vmovapd PQCLEAN_MCELIECE8192128F_AVX_MASK5_1,>mask1=%ymm1 +vmovapd PQCLEAN_MCELIECE8192128F_AVX_MASK5_1(%rip),%ymm1 + +# qhasm: mask2 aligned= mem256[ PQCLEAN_MCELIECE8192128F_AVX_MASK4_0 ] +# asm 1: vmovapd PQCLEAN_MCELIECE8192128F_AVX_MASK4_0,>mask2=reg256#3 +# asm 2: vmovapd PQCLEAN_MCELIECE8192128F_AVX_MASK4_0,>mask2=%ymm2 +vmovapd PQCLEAN_MCELIECE8192128F_AVX_MASK4_0(%rip),%ymm2 + +# qhasm: mask3 aligned= mem256[ PQCLEAN_MCELIECE8192128F_AVX_MASK4_1 ] +# asm 1: vmovapd PQCLEAN_MCELIECE8192128F_AVX_MASK4_1,>mask3=reg256#4 +# asm 2: vmovapd PQCLEAN_MCELIECE8192128F_AVX_MASK4_1,>mask3=%ymm3 +vmovapd PQCLEAN_MCELIECE8192128F_AVX_MASK4_1(%rip),%ymm3 + +# qhasm: mask4 aligned= mem256[ PQCLEAN_MCELIECE8192128F_AVX_MASK3_0 ] +# asm 1: vmovapd PQCLEAN_MCELIECE8192128F_AVX_MASK3_0,>mask4=reg256#5 +# asm 2: vmovapd PQCLEAN_MCELIECE8192128F_AVX_MASK3_0,>mask4=%ymm4 +vmovapd PQCLEAN_MCELIECE8192128F_AVX_MASK3_0(%rip),%ymm4 + +# qhasm: mask5 aligned= mem256[ PQCLEAN_MCELIECE8192128F_AVX_MASK3_1 ] +# asm 1: vmovapd PQCLEAN_MCELIECE8192128F_AVX_MASK3_1,>mask5=reg256#6 +# asm 2: vmovapd PQCLEAN_MCELIECE8192128F_AVX_MASK3_1,>mask5=%ymm5 +vmovapd PQCLEAN_MCELIECE8192128F_AVX_MASK3_1(%rip),%ymm5 + +# qhasm: x0 = mem256[ input_0 + 0 ] +# asm 1: vmovupd 0(x0=reg256#7 +# asm 2: vmovupd 0(x0=%ymm6 +vmovupd 0(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 256 ] +# asm 1: vmovupd 256(x1=reg256#8 +# asm 2: vmovupd 256(x1=%ymm7 +vmovupd 256(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 512 ] +# asm 1: vmovupd 512(x2=reg256#9 +# asm 2: vmovupd 512(x2=%ymm8 +vmovupd 512(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 768 ] +# asm 1: vmovupd 768(x3=reg256#10 +# asm 2: vmovupd 768(x3=%ymm9 +vmovupd 768(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1024 ] +# asm 1: vmovupd 1024(x4=reg256#11 +# asm 2: vmovupd 1024(x4=%ymm10 +vmovupd 1024(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1280 ] +# asm 1: vmovupd 1280(x5=reg256#12 +# asm 2: vmovupd 1280(x5=%ymm11 +vmovupd 1280(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1536 ] +# asm 1: vmovupd 1536(x6=reg256#13 +# asm 2: vmovupd 1536(x6=%ymm12 +vmovupd 1536(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1792 ] +# asm 1: vmovupd 1792(x7=reg256#14 +# asm 2: vmovupd 1792(x7=%ymm13 +vmovupd 1792(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: 
vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: 
vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: x5 = v00 | v10 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: 16x v10 = x1 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm14,%ymm15 + +# qhasm: 16x v01 = x0 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#14 +# asm 2: vpsrlw $8,v01=%ymm13 +vpsrlw $8,%ymm13,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: 16x v10 = x3 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm10,%ymm15 + +# qhasm: 16x v01 = x2 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#12 +# asm 2: vpsrlw $8,v01=%ymm11 +vpsrlw $8,%ymm11,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: 16x v10 = x5 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm8,%ymm15 + +# qhasm: 16x v01 = x4 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#13 +# asm 2: vpsrlw $8,v01=%ymm12 +vpsrlw $8,%ymm12,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor 
%ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: 16x v10 = x7 << 8 +# asm 1: vpsllw $8,v10=reg256#16 +# asm 2: vpsllw $8,v10=%ymm15 +vpsllw $8,%ymm7,%ymm15 + +# qhasm: 16x v01 = x6 unsigned>> 8 +# asm 1: vpsrlw $8,v01=reg256#7 +# asm 2: vpsrlw $8,v01=%ymm6 +vpsrlw $8,%ymm6,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: x6 = v00 | v10 +# asm 1: vpor x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 0 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 32(x0=%ymm6 +vmovupd 32(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 288 ] +# asm 1: vmovupd 288(x1=reg256#8 +# asm 2: vmovupd 288(x1=%ymm7 +vmovupd 288(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 544 ] +# asm 1: vmovupd 544(x2=reg256#9 +# asm 2: vmovupd 544(x2=%ymm8 +vmovupd 544(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 800 ] +# asm 1: vmovupd 800(x3=reg256#10 +# asm 2: vmovupd 800(x3=%ymm9 +vmovupd 800(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1056 ] +# asm 1: vmovupd 1056(x4=reg256#11 +# asm 2: vmovupd 1056(x4=%ymm10 +vmovupd 1056(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1312 ] +# asm 1: vmovupd 1312(x5=reg256#12 +# asm 2: vmovupd 1312(x5=%ymm11 +vmovupd 1312(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1568 ] +# asm 1: vmovupd 1568(x6=reg256#13 +# asm 2: vmovupd 1568(x6=%ymm12 +vmovupd 1568(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1824 ] +# asm 1: vmovupd 1824(x7=reg256#14 +# asm 2: vmovupd 1824(x7=%ymm13 +vmovupd 1824(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: 4x v10 = x4 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm10,%ymm15 + +# qhasm: 4x v01 = x0 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#7 +# asm 2: vpsrlq $32,v01=%ymm6 +vpsrlq $32,%ymm6,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: 4x v10 = x5 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm11,%ymm15 + +# qhasm: 4x v01 = x1 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#8 +# asm 2: vpsrlq $32,v01=%ymm7 +vpsrlq $32,%ymm7,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: 4x v10 = x6 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm12,%ymm15 + +# qhasm: 4x v01 = x2 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#9 +# asm 2: 
vpsrlq $32,v01=%ymm8 +vpsrlq $32,%ymm8,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: x2 = v00 | v10 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: 4x v10 = x7 << 32 +# asm 1: vpsllq $32,v10=reg256#16 +# asm 2: vpsllq $32,v10=%ymm15 +vpsllq $32,%ymm13,%ymm15 + +# qhasm: 4x v01 = x3 unsigned>> 32 +# asm 1: vpsrlq $32,v01=reg256#10 +# asm 2: vpsrlq $32,v01=%ymm9 +vpsrlq $32,%ymm9,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: x3 = v00 | v10 +# asm 1: vpor x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: 8x v10 = x2 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm11,%ymm15 + +# qhasm: 8x v01 = x0 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#15 +# asm 2: vpsrld $16,v01=%ymm14 +vpsrld $16,%ymm14,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: x0 = v00 | v10 +# asm 1: vpor x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: 8x v10 = x3 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm12,%ymm15 + +# qhasm: 8x v01 = x1 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#11 +# asm 2: vpsrld $16,v01=%ymm10 +vpsrld $16,%ymm10,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: x1 = v00 | v10 +# asm 1: vpor x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: 8x v10 = x6 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm8,%ymm15 + +# qhasm: 8x v01 = x4 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#7 +# asm 2: vpsrld $16,v01=%ymm6 +vpsrld $16,%ymm6,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: x4 = v00 | v10 +# asm 1: vpor x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: 8x v10 = x7 << 16 +# asm 1: vpslld $16,v10=reg256#16 +# asm 2: vpslld $16,v10=%ymm15 +vpslld $16,%ymm9,%ymm15 + +# qhasm: 8x v01 = x5 unsigned>> 16 +# asm 1: vpsrld $16,v01=reg256#8 +# asm 2: vpsrld $16,v01=%ymm7 +vpsrld $16,%ymm7,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand 
[qhasm-generated AVX2 assembly added for PQCLEAN_MCELIECE8192128F: the diff continues with repeated bit-interleaving blocks that load eight 256-bit rows from input_0 at a 256-byte stride (vmovupd 64(%rdi) ... 1856(%rdi), then 96 ... 1888, 128 ... 1920, 160 ... 1952, 192 ... 1984 and 224 ... 2016), swap 32-, 16- and 8-bit half-fields between register pairs x0/x4, x0/x2 and x0/x1 using vpand, vpsllq/vpsrlq, vpslld/vpsrld, vpsllw/vpsrlw and vpor against mask0-mask5, and store each group back before loading the next.]
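For orientation, here is a minimal portable-C sketch (my own model, not code from the patch or from PQClean) of the half-field swap that each of those qhasm blocks performs on a pair of rows; s is the half-width of the field and mask_lo selects the low s bits of every 2s-bit field, as the MASK constants do for the vector registers.

#include <stdint.h>

/* Exchange the high s-bit halves of x0's 2s-bit fields with the low s-bit
 * halves of x1's fields: the scalar analogue of one vpand/vpsll/vpsrl/vpor
 * block above, which does the same thing lane-wise on 256-bit registers. */
static inline void swap_halffields(uint64_t *x0, uint64_t *x1,
                                   unsigned s, uint64_t mask_lo)
{
    uint64_t mask_hi = mask_lo << s;         /* high half of every field          */
    uint64_t v00 = *x0 & mask_lo;            /* low halves of x0 stay in place    */
    uint64_t v10 = (*x1 << s) & mask_hi;     /* low halves of x1 move to the top  */
    uint64_t v01 = (*x0 >> s) & mask_lo;     /* high halves of x0 move down       */
    uint64_t v11 = *x1 & mask_hi;            /* high halves of x1 stay in place   */
    *x0 = v00 | v10;
    *x1 = v01 | v11;
}

/* e.g. swap_halffields(&r[0], &r[4], 32, 0x00000000FFFFFFFFull) models the
 * vpsllq $32 block pairing x0 with x4 above.                                */

The vector code does not need the extra & after the shifts because it either shifts whole lanes of width 2s (vpsllq/vpslld/vpsllw) or masks before shifting; the scalar model keeps them so it stays correct when 2s is smaller than the word size.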
[After storing the last of those groups, the routine reloads mask0-mask5 from the PQCLEAN_MCELIECE8192128F_AVX_MASK2_0/_1, MASK1_0/_1 and MASK0_0/_1 constants and runs the fine stages: 4-, 2- and 1-bit half-field swaps (vpsllq/vpsrlq by 4, 2 and 1 with vpand/vpor) between adjacent 256-bit rows, loading them from input_0 at offsets 0-224 and 256-480 and writing each group back in place.]
asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq 
$1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 256 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 512(x0=%ymm6 +vmovupd 512(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 544 ] +# asm 1: vmovupd 544(x1=reg256#8 +# asm 2: vmovupd 544(x1=%ymm7 +vmovupd 544(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 576 ] +# asm 1: vmovupd 576(x2=reg256#9 +# asm 2: vmovupd 576(x2=%ymm8 +vmovupd 576(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 608 ] +# asm 1: vmovupd 608(x3=reg256#10 +# asm 2: vmovupd 608(x3=%ymm9 +vmovupd 608(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 640 ] +# asm 1: vmovupd 640(x4=reg256#11 +# asm 2: vmovupd 640(x4=%ymm10 +vmovupd 640(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 672 ] +# asm 1: vmovupd 672(x5=reg256#12 +# asm 2: vmovupd 672(x5=%ymm11 +vmovupd 672(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 704 ] +# asm 1: vmovupd 704(x6=reg256#13 +# asm 2: vmovupd 704(x6=%ymm12 +vmovupd 704(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 736 ] +# asm 1: vmovupd 736(x7=reg256#14 +# asm 2: vmovupd 736(x7=%ymm13 +vmovupd 736(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand 
%ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: 
vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 512 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 768(x0=%ymm6 +vmovupd 768(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 800 ] +# asm 1: vmovupd 800(x1=reg256#8 +# asm 2: vmovupd 800(x1=%ymm7 +vmovupd 800(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 832 ] +# asm 1: vmovupd 832(x2=reg256#9 +# asm 2: vmovupd 832(x2=%ymm8 +vmovupd 832(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 864 ] +# asm 1: vmovupd 864(x3=reg256#10 +# asm 2: vmovupd 864(x3=%ymm9 +vmovupd 864(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 896 ] +# asm 1: vmovupd 896(x4=reg256#11 +# asm 2: vmovupd 896(x4=%ymm10 +vmovupd 896(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 928 ] +# asm 1: vmovupd 928(x5=reg256#12 +# asm 2: vmovupd 928(x5=%ymm11 +vmovupd 928(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 960 ] +# asm 1: vmovupd 960(x6=reg256#13 +# asm 2: vmovupd 960(x6=%ymm12 +vmovupd 960(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 992 ] +# asm 1: vmovupd 992(x7=reg256#14 +# asm 2: vmovupd 992(x7=%ymm13 +vmovupd 992(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# 
qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 
+vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 
+vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 768 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 1024(x0=%ymm6 +vmovupd 1024(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 1056 ] +# asm 1: vmovupd 1056(x1=reg256#8 +# asm 2: vmovupd 1056(x1=%ymm7 +vmovupd 1056(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 1088 ] +# asm 1: vmovupd 1088(x2=reg256#9 +# asm 2: vmovupd 1088(x2=%ymm8 +vmovupd 1088(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 1120 ] +# asm 1: vmovupd 1120(x3=reg256#10 +# asm 2: vmovupd 1120(x3=%ymm9 +vmovupd 1120(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1152 ] +# asm 1: vmovupd 1152(x4=reg256#11 +# asm 2: vmovupd 1152(x4=%ymm10 +vmovupd 1152(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1184 ] +# asm 1: vmovupd 1184(x5=reg256#12 +# asm 2: vmovupd 1184(x5=%ymm11 +vmovupd 1184(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1216 ] +# asm 1: vmovupd 1216(x6=reg256#13 +# asm 2: vmovupd 1216(x6=%ymm12 +vmovupd 1216(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1248 ] +# asm 1: vmovupd 1248(x7=reg256#14 +# asm 2: vmovupd 1248(x7=%ymm13 +vmovupd 1248(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor 
%ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 
+vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 1024 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 1280(x0=%ymm6 +vmovupd 1280(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 1312 ] +# asm 1: vmovupd 1312(x1=reg256#8 +# asm 2: vmovupd 1312(x1=%ymm7 +vmovupd 1312(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 1344 ] +# asm 1: vmovupd 1344(x2=reg256#9 +# asm 2: vmovupd 1344(x2=%ymm8 +vmovupd 1344(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 1376 ] +# asm 1: vmovupd 1376(x3=reg256#10 +# asm 2: vmovupd 1376(x3=%ymm9 +vmovupd 1376(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1408 ] +# asm 1: vmovupd 1408(x4=reg256#11 +# asm 2: vmovupd 1408(x4=%ymm10 +vmovupd 1408(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1440 ] +# asm 1: vmovupd 1440(x5=reg256#12 +# asm 2: vmovupd 1440(x5=%ymm11 +vmovupd 1440(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1472 ] +# asm 1: vmovupd 1472(x6=reg256#13 +# asm 2: vmovupd 1472(x6=%ymm12 +vmovupd 1472(%rdi),%ymm12 + +# 
qhasm: x7 = mem256[ input_0 + 1504 ] +# asm 1: vmovupd 1504(x7=reg256#14 +# asm 2: vmovupd 1504(x7=%ymm13 +vmovupd 1504(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: 
vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq 
$1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 1280 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 1536(x0=%ymm6 +vmovupd 1536(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 1568 ] +# asm 1: vmovupd 1568(x1=reg256#8 +# asm 2: vmovupd 1568(x1=%ymm7 +vmovupd 1568(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 1600 ] +# asm 1: vmovupd 1600(x2=reg256#9 +# asm 2: vmovupd 1600(x2=%ymm8 +vmovupd 1600(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 1632 ] +# asm 1: vmovupd 1632(x3=reg256#10 +# asm 2: vmovupd 1632(x3=%ymm9 +vmovupd 1632(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1664 ] +# asm 1: vmovupd 1664(x4=reg256#11 +# asm 2: vmovupd 1664(x4=%ymm10 +vmovupd 1664(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1696 ] +# asm 1: vmovupd 1696(x5=reg256#12 +# asm 2: vmovupd 1696(x5=%ymm11 +vmovupd 1696(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1728 ] +# asm 1: vmovupd 1728(x6=reg256#13 +# asm 2: vmovupd 1728(x6=%ymm12 +vmovupd 1728(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 1760 ] +# asm 1: vmovupd 1760(x7=reg256#14 +# asm 2: vmovupd 1760(x7=%ymm13 +vmovupd 1760(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 
2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm13,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#14 +# asm 2: vpand v11=%ymm13 +vpand %ymm13,%ymm1,%ymm13 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#13 +# asm 2: vpor x3=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#10 +# asm 2: vpor x7=%ymm9 +vpor %ymm9,%ymm13,%ymm9 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#14 +# asm 2: vpand v00=%ymm13 +vpand %ymm14,%ymm2,%ymm13 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#15 +# asm 2: vpand v01=%ymm14 +vpand %ymm14,%ymm3,%ymm14 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#14 +# asm 2: vpor x0=%ymm13 +vpor %ymm13,%ymm15,%ymm13 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm14,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm10,%ymm2,%ymm14 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm3,%ymm12 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#15 +# asm 2: vpor x1=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm10,%ymm12,%ymm10 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm2,%ymm12 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#13 +# asm 2: vpor x4=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm9,%ymm2,%ymm15 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq 
$2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm9,%ymm3,%ymm9 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm8,%ymm15,%ymm8 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#8 +# asm 2: vpor x7=%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm14,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#14 +# asm 2: vpand v01=%ymm13 +vpand %ymm13,%ymm5,%ymm13 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#15 +# asm 2: vpand v11=%ymm14 +vpand %ymm14,%ymm5,%ymm14 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm15,%ymm9 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#14 +# asm 2: vpor x1=%ymm13 +vpor %ymm13,%ymm14,%ymm13 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm11,%ymm4,%ymm14 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#15 +# asm 2: vpor x2=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#11 +# asm 2: vpor x3=%ymm10 +vpor %ymm11,%ymm10,%ymm10 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm12,%ymm4,%ymm11 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm8,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#13 +# asm 2: vpand v01=%ymm12 +vpand %ymm12,%ymm5,%ymm12 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm5,%ymm8 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#12 +# asm 2: vpor x4=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#9 +# asm 2: vpor x5=%ymm8 +vpor %ymm12,%ymm8,%ymm8 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm6,%ymm4,%ymm12 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm7,%ymm4,%ymm15 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#8 +# asm 2: vpand v11=%ymm7 +vpand %ymm7,%ymm5,%ymm7 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#13 +# asm 2: vpor x6=%ymm12 +vpor %ymm12,%ymm15,%ymm12 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#7 +# asm 2: vpor x7=%ymm6 +vpor %ymm6,%ymm7,%ymm6 + +# qhasm: mem256[ input_0 + 1536 ] = x0 +# asm 1: vmovupd x0=reg256#7 +# asm 2: vmovupd 1792(x0=%ymm6 +vmovupd 1792(%rdi),%ymm6 + +# qhasm: x1 = mem256[ input_0 + 1824 ] +# asm 1: vmovupd 1824(x1=reg256#8 +# asm 2: vmovupd 1824(x1=%ymm7 +vmovupd 1824(%rdi),%ymm7 + +# qhasm: x2 = mem256[ input_0 + 1856 ] +# asm 1: vmovupd 1856(x2=reg256#9 +# asm 2: vmovupd 1856(x2=%ymm8 +vmovupd 1856(%rdi),%ymm8 + +# qhasm: x3 = mem256[ input_0 + 1888 ] +# asm 1: vmovupd 1888(x3=reg256#10 +# 
asm 2: vmovupd 1888(x3=%ymm9 +vmovupd 1888(%rdi),%ymm9 + +# qhasm: x4 = mem256[ input_0 + 1920 ] +# asm 1: vmovupd 1920(x4=reg256#11 +# asm 2: vmovupd 1920(x4=%ymm10 +vmovupd 1920(%rdi),%ymm10 + +# qhasm: x5 = mem256[ input_0 + 1952 ] +# asm 1: vmovupd 1952(x5=reg256#12 +# asm 2: vmovupd 1952(x5=%ymm11 +vmovupd 1952(%rdi),%ymm11 + +# qhasm: x6 = mem256[ input_0 + 1984 ] +# asm 1: vmovupd 1984(x6=reg256#13 +# asm 2: vmovupd 1984(x6=%ymm12 +vmovupd 1984(%rdi),%ymm12 + +# qhasm: x7 = mem256[ input_0 + 2016 ] +# asm 1: vmovupd 2016(x7=reg256#14 +# asm 2: vmovupd 2016(x7=%ymm13 +vmovupd 2016(%rdi),%ymm13 + +# qhasm: v00 = x0 & mask0 +# asm 1: vpand v00=reg256#15 +# asm 2: vpand v00=%ymm14 +vpand %ymm6,%ymm0,%ymm14 + +# qhasm: v10 = x4 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm10,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm1,%ymm6 + +# qhasm: v11 = x4 & mask1 +# asm 1: vpand v11=reg256#11 +# asm 2: vpand v11=%ymm10 +vpand %ymm10,%ymm1,%ymm10 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x0=reg256#15 +# asm 2: vpor x0=%ymm14 +vpor %ymm14,%ymm15,%ymm14 + +# qhasm: x4 = v01 | v11 +# asm 1: vpor x4=reg256#7 +# asm 2: vpor x4=%ymm6 +vpor %ymm6,%ymm10,%ymm6 + +# qhasm: v00 = x1 & mask0 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm7,%ymm0,%ymm10 + +# qhasm: v10 = x5 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm11,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm1,%ymm7 + +# qhasm: v11 = x5 & mask1 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm1,%ymm11 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x1=reg256#11 +# asm 2: vpor x1=%ymm10 +vpor %ymm10,%ymm15,%ymm10 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#8 +# asm 2: vpor x5=%ymm7 +vpor %ymm7,%ymm11,%ymm7 + +# qhasm: v00 = x2 & mask0 +# asm 1: vpand v00=reg256#12 +# asm 2: vpand v00=%ymm11 +vpand %ymm8,%ymm0,%ymm11 + +# qhasm: v10 = x6 & mask0 +# asm 1: vpand v10=reg256#16 +# asm 2: vpand v10=%ymm15 +vpand %ymm12,%ymm0,%ymm15 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm8,%ymm1,%ymm8 + +# qhasm: v11 = x6 & mask1 +# asm 1: vpand v11=reg256#13 +# asm 2: vpand v11=%ymm12 +vpand %ymm12,%ymm1,%ymm12 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm11,%ymm15,%ymm11 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#9 +# asm 2: vpor x6=%ymm8 +vpor %ymm8,%ymm12,%ymm8 + +# qhasm: v00 = x3 & mask0 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm9,%ymm0,%ymm12 + +# qhasm: v10 = x7 & mask0 +# asm 1: vpand v10=reg256#1 +# asm 2: vpand v10=%ymm0 +vpand %ymm13,%ymm0,%ymm0 + +# qhasm: 4x v10 <<= 4 +# asm 1: vpsllq $4,v01=reg256#10 +# asm 2: vpand v01=%ymm9 +vpand %ymm9,%ymm1,%ymm9 + +# qhasm: v11 = x7 & mask1 +# asm 1: vpand v11=reg256#2 +# asm 2: vpand v11=%ymm1 +vpand %ymm13,%ymm1,%ymm1 + +# qhasm: 4x v01 unsigned>>= 4 +# asm 1: vpsrlq $4,x3=reg256#1 +# asm 2: vpor x3=%ymm0 +vpor %ymm12,%ymm0,%ymm0 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#2 +# asm 2: vpor x7=%ymm1 +vpor %ymm9,%ymm1,%ymm1 + +# qhasm: v00 = x0 & mask2 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm14,%ymm2,%ymm9 + +# qhasm: v10 = x2 & mask2 +# asm 1: vpand v10=reg256#13 +# asm 2: vpand v10=%ymm12 +vpand %ymm11,%ymm2,%ymm12 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#14 +# 
asm 2: vpand v01=%ymm13 +vpand %ymm14,%ymm3,%ymm13 + +# qhasm: v11 = x2 & mask3 +# asm 1: vpand v11=reg256#12 +# asm 2: vpand v11=%ymm11 +vpand %ymm11,%ymm3,%ymm11 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x0=reg256#10 +# asm 2: vpor x0=%ymm9 +vpor %ymm9,%ymm12,%ymm9 + +# qhasm: x2 = v01 | v11 +# asm 1: vpor x2=reg256#12 +# asm 2: vpor x2=%ymm11 +vpor %ymm13,%ymm11,%ymm11 + +# qhasm: v00 = x1 & mask2 +# asm 1: vpand v00=reg256#13 +# asm 2: vpand v00=%ymm12 +vpand %ymm10,%ymm2,%ymm12 + +# qhasm: v10 = x3 & mask2 +# asm 1: vpand v10=reg256#14 +# asm 2: vpand v10=%ymm13 +vpand %ymm0,%ymm2,%ymm13 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm3,%ymm10 + +# qhasm: v11 = x3 & mask3 +# asm 1: vpand v11=reg256#1 +# asm 2: vpand v11=%ymm0 +vpand %ymm0,%ymm3,%ymm0 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x1=reg256#13 +# asm 2: vpor x1=%ymm12 +vpor %ymm12,%ymm13,%ymm12 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#1 +# asm 2: vpor x3=%ymm0 +vpor %ymm10,%ymm0,%ymm0 + +# qhasm: v00 = x4 & mask2 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm6,%ymm2,%ymm10 + +# qhasm: v10 = x6 & mask2 +# asm 1: vpand v10=reg256#14 +# asm 2: vpand v10=%ymm13 +vpand %ymm8,%ymm2,%ymm13 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm3,%ymm6 + +# qhasm: v11 = x6 & mask3 +# asm 1: vpand v11=reg256#9 +# asm 2: vpand v11=%ymm8 +vpand %ymm8,%ymm3,%ymm8 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x4=reg256#11 +# asm 2: vpor x4=%ymm10 +vpor %ymm10,%ymm13,%ymm10 + +# qhasm: x6 = v01 | v11 +# asm 1: vpor x6=reg256#7 +# asm 2: vpor x6=%ymm6 +vpor %ymm6,%ymm8,%ymm6 + +# qhasm: v00 = x5 & mask2 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm7,%ymm2,%ymm8 + +# qhasm: v10 = x7 & mask2 +# asm 1: vpand v10=reg256#3 +# asm 2: vpand v10=%ymm2 +vpand %ymm1,%ymm2,%ymm2 + +# qhasm: 4x v10 <<= 2 +# asm 1: vpsllq $2,v01=reg256#8 +# asm 2: vpand v01=%ymm7 +vpand %ymm7,%ymm3,%ymm7 + +# qhasm: v11 = x7 & mask3 +# asm 1: vpand v11=reg256#2 +# asm 2: vpand v11=%ymm1 +vpand %ymm1,%ymm3,%ymm1 + +# qhasm: 4x v01 unsigned>>= 2 +# asm 1: vpsrlq $2,x5=reg256#3 +# asm 2: vpor x5=%ymm2 +vpor %ymm8,%ymm2,%ymm2 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#2 +# asm 2: vpor x7=%ymm1 +vpor %ymm7,%ymm1,%ymm1 + +# qhasm: v00 = x0 & mask4 +# asm 1: vpand v00=reg256#4 +# asm 2: vpand v00=%ymm3 +vpand %ymm9,%ymm4,%ymm3 + +# qhasm: v10 = x1 & mask4 +# asm 1: vpand v10=reg256#8 +# asm 2: vpand v10=%ymm7 +vpand %ymm12,%ymm4,%ymm7 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#9 +# asm 2: vpand v01=%ymm8 +vpand %ymm9,%ymm5,%ymm8 + +# qhasm: v11 = x1 & mask5 +# asm 1: vpand v11=reg256#10 +# asm 2: vpand v11=%ymm9 +vpand %ymm12,%ymm5,%ymm9 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x0=reg256#4 +# asm 2: vpor x0=%ymm3 +vpor %ymm3,%ymm7,%ymm3 + +# qhasm: x1 = v01 | v11 +# asm 1: vpor x1=reg256#8 +# asm 2: vpor x1=%ymm7 +vpor %ymm8,%ymm9,%ymm7 + +# qhasm: v00 = x2 & mask4 +# asm 1: vpand v00=reg256#9 +# asm 2: vpand v00=%ymm8 +vpand %ymm11,%ymm4,%ymm8 + +# qhasm: v10 = x3 & mask4 +# asm 1: vpand v10=reg256#10 +# asm 2: vpand v10=%ymm9 +vpand %ymm0,%ymm4,%ymm9 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#12 +# asm 2: vpand v01=%ymm11 +vpand %ymm11,%ymm5,%ymm11 + +# qhasm: v11 = x3 & mask5 +# asm 1: vpand v11=reg256#1 +# asm 2: vpand v11=%ymm0 +vpand %ymm0,%ymm5,%ymm0 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x2=reg256#9 +# asm 2: vpor x2=%ymm8 
+vpor %ymm8,%ymm9,%ymm8 + +# qhasm: x3 = v01 | v11 +# asm 1: vpor x3=reg256#1 +# asm 2: vpor x3=%ymm0 +vpor %ymm11,%ymm0,%ymm0 + +# qhasm: v00 = x4 & mask4 +# asm 1: vpand v00=reg256#10 +# asm 2: vpand v00=%ymm9 +vpand %ymm10,%ymm4,%ymm9 + +# qhasm: v10 = x5 & mask4 +# asm 1: vpand v10=reg256#12 +# asm 2: vpand v10=%ymm11 +vpand %ymm2,%ymm4,%ymm11 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#11 +# asm 2: vpand v01=%ymm10 +vpand %ymm10,%ymm5,%ymm10 + +# qhasm: v11 = x5 & mask5 +# asm 1: vpand v11=reg256#3 +# asm 2: vpand v11=%ymm2 +vpand %ymm2,%ymm5,%ymm2 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x4=reg256#10 +# asm 2: vpor x4=%ymm9 +vpor %ymm9,%ymm11,%ymm9 + +# qhasm: x5 = v01 | v11 +# asm 1: vpor x5=reg256#3 +# asm 2: vpor x5=%ymm2 +vpor %ymm10,%ymm2,%ymm2 + +# qhasm: v00 = x6 & mask4 +# asm 1: vpand v00=reg256#11 +# asm 2: vpand v00=%ymm10 +vpand %ymm6,%ymm4,%ymm10 + +# qhasm: v10 = x7 & mask4 +# asm 1: vpand v10=reg256#5 +# asm 2: vpand v10=%ymm4 +vpand %ymm1,%ymm4,%ymm4 + +# qhasm: 4x v10 <<= 1 +# asm 1: vpsllq $1,v01=reg256#7 +# asm 2: vpand v01=%ymm6 +vpand %ymm6,%ymm5,%ymm6 + +# qhasm: v11 = x7 & mask5 +# asm 1: vpand v11=reg256#2 +# asm 2: vpand v11=%ymm1 +vpand %ymm1,%ymm5,%ymm1 + +# qhasm: 4x v01 unsigned>>= 1 +# asm 1: vpsrlq $1,x6=reg256#5 +# asm 2: vpor x6=%ymm4 +vpor %ymm10,%ymm4,%ymm4 + +# qhasm: x7 = v01 | v11 +# asm 1: vpor x7=reg256#2 +# asm 2: vpor x7=%ymm1 +vpor %ymm6,%ymm1,%ymm1 + +# qhasm: mem256[ input_0 + 1792 ] = x0 +# asm 1: vmovupd +#include + +void PQCLEAN_MCELIECE8192128F_AVX_uint32_sort(uint32_t *x, size_t n); + +#endif diff --git a/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/update_asm.S b/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/update_asm.S new file mode 100644 index 0000000000..89cccfdfb4 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/update_asm.S @@ -0,0 +1,576 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: int64 s0 + +# qhasm: int64 s1 + +# qhasm: int64 s2 + +# qhasm: enter update_asm +.p2align 5 +.global _PQCLEAN_MCELIECE8192128F_AVX_update_asm +.global PQCLEAN_MCELIECE8192128F_AVX_update_asm +_PQCLEAN_MCELIECE8192128F_AVX_update_asm: +PQCLEAN_MCELIECE8192128F_AVX_update_asm: +mov %rsp,%r11 +and $31,%r11 +add $0,%r11 +sub %r11,%rsp + +# qhasm: s2 = input_1 +# asm 1: mov s2=int64#2 +# asm 2: mov s2=%rsi +mov %rsi,%rsi + +# qhasm: s0 = mem64[ input_0 + 0 ] +# asm 1: movq 0(s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# 
asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1,s0=int64#4 +# asm 2: movq 0(s0=%rcx +movq 0(%rdi),%rcx + +# qhasm: s1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(s1=int64#5 +# asm 2: movq 8(s1=%r8 +movq 8(%rdi),%r8 + +# qhasm: s0 = (s1 s0) >> 1 +# asm 1: shrd $1,> 1 +# asm 1: shrd $1,>= 1 +# asm 1: shr $1, + +void PQCLEAN_MCELIECE8192128F_AVX_store_i(unsigned char *out, uint64_t in, int i) { + for (int j = 0; j < i; j++) { + out[j] = (in >> (j * 8)) & 0xFF; + } +} + +void PQCLEAN_MCELIECE8192128F_AVX_store2(unsigned char *dest, uint16_t a) { + dest[0] = a & 0xFF; + dest[1] = a >> 8; +} + +uint16_t PQCLEAN_MCELIECE8192128F_AVX_load2(const unsigned char *src) { + uint16_t a; + + a = src[1]; + a <<= 8; + a |= src[0]; + + return a & GFMASK; +} + +uint32_t PQCLEAN_MCELIECE8192128F_AVX_load4(const unsigned char *src) { + uint32_t a; + + a = src[3]; + a <<= 8; + a |= src[2]; + a <<= 8; + a |= src[1]; + a <<= 8; + a |= src[0]; + + return a; +} + +void PQCLEAN_MCELIECE8192128F_AVX_irr_load(vec128 *out, const unsigned char *in) { + int i, j; + uint64_t v0 = 0, v1 = 0; + uint16_t irr[ SYS_T ]; + + for (i = 0; i < SYS_T; i++) { + irr[i] = PQCLEAN_MCELIECE8192128F_AVX_load2(in + i * 2); + irr[i] &= GFMASK; + } + + for (i = 0; i < GFBITS; i++) { + for (j = 63; j >= 0; j--) { + v0 <<= 1; + 
v1 <<= 1; + v0 |= (irr[j] >> i) & 1; + v1 |= (irr[j + 64] >> i) & 1; + } + + out[i] = PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(v0, v1); + } +} + +void PQCLEAN_MCELIECE8192128F_AVX_store8(unsigned char *out, uint64_t in) { + out[0] = (in >> 0x00) & 0xFF; + out[1] = (in >> 0x08) & 0xFF; + out[2] = (in >> 0x10) & 0xFF; + out[3] = (in >> 0x18) & 0xFF; + out[4] = (in >> 0x20) & 0xFF; + out[5] = (in >> 0x28) & 0xFF; + out[6] = (in >> 0x30) & 0xFF; + out[7] = (in >> 0x38) & 0xFF; +} + +uint64_t PQCLEAN_MCELIECE8192128F_AVX_load8(const unsigned char *in) { + int i; + uint64_t ret = in[7]; + + for (i = 6; i >= 0; i--) { + ret <<= 8; + ret |= in[i]; + } + + return ret; +} + +vec128 PQCLEAN_MCELIECE8192128F_AVX_load16(const unsigned char *in) { + return PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x( PQCLEAN_MCELIECE8192128F_AVX_load8(in), PQCLEAN_MCELIECE8192128F_AVX_load8(in + 8) ); +} + +void PQCLEAN_MCELIECE8192128F_AVX_store16(unsigned char *out, vec128 in) { + PQCLEAN_MCELIECE8192128F_AVX_store8(out + 0, PQCLEAN_MCELIECE8192128F_AVX_vec128_extract(in, 0)); + PQCLEAN_MCELIECE8192128F_AVX_store8(out + 8, PQCLEAN_MCELIECE8192128F_AVX_vec128_extract(in, 1)); +} diff --git a/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/util.h b/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/util.h new file mode 100644 index 0000000000..6cf3988d41 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/util.h @@ -0,0 +1,23 @@ +#ifndef PQCLEAN_MCELIECE8192128F_AVX_UTIL_H +#define PQCLEAN_MCELIECE8192128F_AVX_UTIL_H +/* + This file is for loading/storing data in a little-endian fashion +*/ + + +#include "vec128.h" + +#include + +void PQCLEAN_MCELIECE8192128F_AVX_store_i(unsigned char *out, uint64_t in, int i); +void PQCLEAN_MCELIECE8192128F_AVX_store2(unsigned char *dest, uint16_t a); +uint16_t PQCLEAN_MCELIECE8192128F_AVX_load2(const unsigned char *src); +uint32_t PQCLEAN_MCELIECE8192128F_AVX_load4(const unsigned char *src); +void PQCLEAN_MCELIECE8192128F_AVX_irr_load(vec128 *out, const unsigned char *in); +void PQCLEAN_MCELIECE8192128F_AVX_store8(unsigned char *out, uint64_t in); +uint64_t PQCLEAN_MCELIECE8192128F_AVX_load8(const unsigned char *in); +vec128 PQCLEAN_MCELIECE8192128F_AVX_load16(const unsigned char *in); + +void PQCLEAN_MCELIECE8192128F_AVX_store16(unsigned char *out, vec128 in); + +#endif diff --git a/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/vec128.c b/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/vec128.c new file mode 100644 index 0000000000..1d1e151663 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/vec128.c @@ -0,0 +1,79 @@ +#include "vec128.h" + +vec128 PQCLEAN_MCELIECE8192128F_AVX_vec128_set1_16b(uint16_t a) { + return _mm_set1_epi16(a); +} + +vec128 PQCLEAN_MCELIECE8192128F_AVX_vec128_setzero(void) { + return _mm_setzero_si128(); +} + +vec128 PQCLEAN_MCELIECE8192128F_AVX_vec128_and(vec128 a, vec128 b) { + return _mm_and_si128(a, b); +} + +vec128 PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(vec128 a, vec128 b) { + return _mm_xor_si128(a, b); +} + +vec128 PQCLEAN_MCELIECE8192128F_AVX_vec128_or(vec128 a, vec128 b) { + return _mm_or_si128(a, b); +} + +vec128 PQCLEAN_MCELIECE8192128F_AVX_vec128_sll_2x(vec128 a, int s) { + return _mm_slli_epi64(a, s); +} + +vec128 PQCLEAN_MCELIECE8192128F_AVX_vec128_srl_2x(vec128 a, int s) { + return _mm_srli_epi64(a, s); +} + +vec128 PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(uint64_t a0, uint64_t a1) { + return _mm_set_epi64x(a1, a0); +} + +vec128 
PQCLEAN_MCELIECE8192128F_AVX_vec128_unpack_low(vec128 a, vec128 b) { + return _mm_unpacklo_epi64(a, b); +} + +vec128 PQCLEAN_MCELIECE8192128F_AVX_vec128_unpack_high(vec128 a, vec128 b) { + return _mm_unpackhi_epi64(a, b); +} + +vec128 PQCLEAN_MCELIECE8192128F_AVX_vec128_setbits(uint64_t a) { + return _mm_set1_epi64x(-a); +} + +void PQCLEAN_MCELIECE8192128F_AVX_vec128_copy(vec128 *dest, const vec128 *src) { + int i; + + for (i = 0; i < GFBITS; i++) { + dest[i] = src[i]; + } +} + +void PQCLEAN_MCELIECE8192128F_AVX_vec128_add(vec128 *c, const vec128 *a, const vec128 *b) { + int i; + + for (i = 0; i < GFBITS; i++) { + c[i] = PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(a[i], b[i]); + } +} + +vec128 PQCLEAN_MCELIECE8192128F_AVX_vec128_or_reduce(const vec128 *a) { + int i; + vec128 ret; + + ret = a[0]; + for (i = 1; i < GFBITS; i++) { + ret = PQCLEAN_MCELIECE8192128F_AVX_vec128_or(ret, a[i]); + } + + return ret; +} + +/* bitsliced field multiplications */ +void PQCLEAN_MCELIECE8192128F_AVX_vec128_mul(vec128 *h, vec128 *f, const vec128 *g) { + PQCLEAN_MCELIECE8192128F_AVX_vec128_mul_asm(h, f, g, 16); +} + diff --git a/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/vec128.h b/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/vec128.h new file mode 100644 index 0000000000..c1aa6dca95 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/vec128.h @@ -0,0 +1,41 @@ +#ifndef PQCLEAN_MCELIECE8192128F_AVX_VEC128_H +#define PQCLEAN_MCELIECE8192128F_AVX_VEC128_H +/* + This file is for functions related to 128-bit vectors + including functions for bitsliced field operations +*/ + + +#include "params.h" + +#include +#include + +typedef __m128i vec128; + +// this needs to be a macro, because +// _mm_extract_epi64 requires a literal int argument. 
+#define PQCLEAN_MCELIECE8192128F_AVX_vec128_extract(a, i) ((uint64_t) _mm_extract_epi64((vec128) (a), (i))) + +int PQCLEAN_MCELIECE8192128F_AVX_vec128_testz(vec128 a); +vec128 PQCLEAN_MCELIECE8192128F_AVX_vec128_set1_16b(uint16_t a); +vec128 PQCLEAN_MCELIECE8192128F_AVX_vec128_setzero(void); +vec128 PQCLEAN_MCELIECE8192128F_AVX_vec128_and(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE8192128F_AVX_vec128_xor(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE8192128F_AVX_vec128_or(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE8192128F_AVX_vec128_sll_2x(vec128 a, int s); +vec128 PQCLEAN_MCELIECE8192128F_AVX_vec128_srl_2x(vec128 a, int s); +vec128 PQCLEAN_MCELIECE8192128F_AVX_vec128_set2x(uint64_t a0, uint64_t a1); +vec128 PQCLEAN_MCELIECE8192128F_AVX_vec128_unpack_low(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE8192128F_AVX_vec128_unpack_high(vec128 a, vec128 b); +vec128 PQCLEAN_MCELIECE8192128F_AVX_vec128_setbits(uint64_t a); +void PQCLEAN_MCELIECE8192128F_AVX_vec128_copy(vec128 *dest, const vec128 *src); +void PQCLEAN_MCELIECE8192128F_AVX_vec128_add(vec128 *c, const vec128 *a, const vec128 *b); +vec128 PQCLEAN_MCELIECE8192128F_AVX_vec128_or_reduce(const vec128 *a); + +extern void PQCLEAN_MCELIECE8192128F_AVX_vec128_mul_asm(vec128 *, vec128 *, const vec128 *, int); + +/* bitsliced field multiplications */ +void PQCLEAN_MCELIECE8192128F_AVX_vec128_mul(vec128 *h, vec128 *f, const vec128 *g); + +#endif diff --git a/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/vec128_mul_asm.S b/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/vec128_mul_asm.S new file mode 100644 index 0000000000..77172c3b3c --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/vec128_mul_asm.S @@ -0,0 +1,1816 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: reg256 b0 + +# qhasm: reg256 b1 + +# qhasm: reg256 b2 + +# qhasm: reg256 b3 + +# qhasm: reg256 b4 + +# qhasm: reg256 b5 + +# qhasm: reg256 b6 + +# qhasm: reg256 b7 + +# qhasm: reg256 b8 + +# qhasm: reg256 b9 + +# qhasm: reg256 b10 + +# qhasm: reg256 b11 + +# qhasm: reg256 b12 + +# qhasm: reg256 a0 + +# qhasm: reg256 a1 + +# qhasm: reg256 a2 + +# qhasm: reg256 a3 + +# qhasm: reg256 a4 + +# qhasm: reg256 a5 + +# qhasm: reg256 a6 + +# qhasm: reg256 r0 + +# qhasm: reg256 r1 + +# qhasm: reg256 r2 + +# qhasm: reg256 r3 + +# qhasm: reg256 r4 + +# qhasm: reg256 r5 + +# qhasm: reg256 r6 + +# qhasm: reg256 r7 + +# qhasm: reg256 r8 + +# qhasm: reg256 r9 + +# qhasm: reg256 r10 + +# qhasm: reg256 r11 + +# qhasm: reg256 r12 + +# qhasm: reg256 r13 + +# qhasm: reg256 r14 + +# qhasm: reg256 r15 + +# qhasm: reg256 r16 + +# qhasm: reg256 r17 + +# qhasm: reg256 r18 + +# qhasm: reg256 r19 + +# qhasm: reg256 r20 + +# qhasm: reg256 r21 + +# qhasm: reg256 r22 + +# qhasm: reg256 r23 + +# qhasm: reg256 r24 + +# qhasm: reg256 r + +# qhasm: reg128 h0 + +# qhasm: reg128 h1 + +# qhasm: reg128 h2 + +# qhasm: reg128 h3 + +# qhasm: reg128 h4 + +# qhasm: reg128 h5 + +# qhasm: reg128 h6 + +# qhasm: reg128 h7 + +# qhasm: reg128 h8 + +# qhasm: reg128 h9 + +# qhasm: reg128 h10 + +# qhasm: reg128 h11 + +# qhasm: reg128 h12 + +# qhasm: reg128 h13 + +# qhasm: reg128 h14 + +# qhasm: reg128 h15 + +# qhasm: reg128 
h16 + +# qhasm: reg128 h17 + +# qhasm: reg128 h18 + +# qhasm: reg128 h19 + +# qhasm: reg128 h20 + +# qhasm: reg128 h21 + +# qhasm: reg128 h22 + +# qhasm: reg128 h23 + +# qhasm: reg128 h24 + +# qhasm: stack4864 buf + +# qhasm: int64 ptr + +# qhasm: int64 tmp + +# qhasm: enter vec128_mul_asm +.p2align 5 +.global _PQCLEAN_MCELIECE8192128F_AVX_vec128_mul_asm +.global PQCLEAN_MCELIECE8192128F_AVX_vec128_mul_asm +_PQCLEAN_MCELIECE8192128F_AVX_vec128_mul_asm: +PQCLEAN_MCELIECE8192128F_AVX_vec128_mul_asm: +mov %rsp,%r11 +and $31,%r11 +add $608,%r11 +sub %r11,%rsp + +# qhasm: ptr = &buf +# asm 1: leaq ptr=int64#5 +# asm 2: leaq ptr=%r8 +leaq 0(%rsp),%r8 + +# qhasm: tmp = input_3 +# asm 1: mov tmp=int64#6 +# asm 2: mov tmp=%r9 +mov %rcx,%r9 + +# qhasm: tmp *= 12 +# asm 1: imulq $12,tmp=int64#6 +# asm 2: imulq $12,tmp=%r9 +imulq $12,%r9,%r9 + +# qhasm: input_2 += tmp +# asm 1: add b12=reg256#1 +# asm 2: vbroadcasti128 0(b12=%ymm0 +vbroadcasti128 0(%rdx), %ymm0 + +# qhasm: input_2 -= input_3 +# asm 1: sub a6=reg256#2 +# asm 2: vpxor a6=%ymm1 +vpxor %ymm1,%ymm1,%ymm1 + +# qhasm: a6[0] = mem128[ input_1 + 96 ] +# asm 1: vinsertf128 $0x0,96(r18=reg256#3 +# asm 2: vpand r18=%ymm2 +vpand %ymm0,%ymm1,%ymm2 + +# qhasm: mem256[ ptr + 576 ] = r18 +# asm 1: vmovupd r17=reg256#4 +# asm 2: vpand r17=%ymm3 +vpand %ymm0,%ymm2,%ymm3 + +# qhasm: a4[0] = mem128[ input_1 + 64 ] +# asm 1: vinsertf128 $0x0,64(r16=reg256#6 +# asm 2: vpand r16=%ymm5 +vpand %ymm0,%ymm4,%ymm5 + +# qhasm: a3[0] = mem128[ input_1 + 48 ] +# asm 1: vinsertf128 $0x0,48(r15=reg256#8 +# asm 2: vpand r15=%ymm7 +vpand %ymm0,%ymm6,%ymm7 + +# qhasm: a2[0] = mem128[ input_1 + 32 ] +# asm 1: vinsertf128 $0x0,32(r14=reg256#10 +# asm 2: vpand r14=%ymm9 +vpand %ymm0,%ymm8,%ymm9 + +# qhasm: a1[0] = mem128[ input_1 + 16 ] +# asm 1: vinsertf128 $0x0,16(r13=reg256#12 +# asm 2: vpand r13=%ymm11 +vpand %ymm0,%ymm10,%ymm11 + +# qhasm: a0[0] = mem128[ input_1 + 0 ] +# asm 1: vinsertf128 $0x0,0(r12=reg256#1 +# asm 2: vpand r12=%ymm0 +vpand %ymm0,%ymm12,%ymm0 + +# qhasm: b11 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b11=reg256#14 +# asm 2: vbroadcasti128 0(b11=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm2,%ymm3 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm4,%ymm3 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm6,%ymm3 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm8,%ymm3 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm10,%ymm3 + +# qhasm: r12 ^= r +# asm 1: vpxor r11=reg256#4 +# asm 2: vpand r11=%ymm3 +vpand %ymm13,%ymm12,%ymm3 + +# qhasm: b10 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b10=reg256#14 +# asm 2: vbroadcasti128 0(b10=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm2,%ymm5 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm4,%ymm5 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm6,%ymm5 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm8,%ymm5 + +# qhasm: r12 ^= r +# 
asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm10,%ymm5 + +# qhasm: r11 ^= r +# asm 1: vpxor r10=reg256#6 +# asm 2: vpand r10=%ymm5 +vpand %ymm13,%ymm12,%ymm5 + +# qhasm: b9 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b9=reg256#14 +# asm 2: vbroadcasti128 0(b9=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm2,%ymm7 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm4,%ymm7 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm6,%ymm7 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm8,%ymm7 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm10,%ymm7 + +# qhasm: r10 ^= r +# asm 1: vpxor r9=reg256#8 +# asm 2: vpand r9=%ymm7 +vpand %ymm13,%ymm12,%ymm7 + +# qhasm: b8 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b8=reg256#14 +# asm 2: vbroadcasti128 0(b8=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm2,%ymm9 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm6,%ymm9 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm8,%ymm9 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm10,%ymm9 + +# qhasm: r9 ^= r +# asm 1: vpxor r8=reg256#10 +# asm 2: vpand r8=%ymm9 +vpand %ymm13,%ymm12,%ymm9 + +# qhasm: b7 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b7=reg256#14 +# asm 2: vbroadcasti128 0(b7=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm2,%ymm11 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm4,%ymm11 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm6,%ymm11 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm8,%ymm11 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm10,%ymm11 + +# qhasm: r8 ^= r +# asm 1: vpxor r7=reg256#12 +# asm 2: vpand r7=%ymm11 +vpand %ymm13,%ymm12,%ymm11 + +# qhasm: b6 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b6=reg256#14 +# asm 2: vbroadcasti128 0(b6=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm2,%ymm0 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm4,%ymm0 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm6,%ymm0 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm8,%ymm0 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm10,%ymm0 + +# qhasm: r7 ^= r +# asm 1: vpxor r6=reg256#1 +# 
asm 2: vpand r6=%ymm0 +vpand %ymm13,%ymm12,%ymm0 + +# qhasm: b5 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b5=reg256#14 +# asm 2: vbroadcasti128 0(b5=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm2,%ymm3 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm4,%ymm3 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm6,%ymm3 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm8,%ymm3 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#4 +# asm 2: vpand r=%ymm3 +vpand %ymm13,%ymm10,%ymm3 + +# qhasm: r6 ^= r +# asm 1: vpxor r5=reg256#4 +# asm 2: vpand r5=%ymm3 +vpand %ymm13,%ymm12,%ymm3 + +# qhasm: b4 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b4=reg256#14 +# asm 2: vbroadcasti128 0(b4=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm2,%ymm5 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm4,%ymm5 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm6,%ymm5 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm8,%ymm5 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#6 +# asm 2: vpand r=%ymm5 +vpand %ymm13,%ymm10,%ymm5 + +# qhasm: r5 ^= r +# asm 1: vpxor r4=reg256#6 +# asm 2: vpand r4=%ymm5 +vpand %ymm13,%ymm12,%ymm5 + +# qhasm: b3 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b3=reg256#14 +# asm 2: vbroadcasti128 0(b3=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm2,%ymm7 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm4,%ymm7 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm6,%ymm7 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm8,%ymm7 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#8 +# asm 2: vpand r=%ymm7 +vpand %ymm13,%ymm10,%ymm7 + +# qhasm: r4 ^= r +# asm 1: vpxor r3=reg256#8 +# asm 2: vpand r3=%ymm7 +vpand %ymm13,%ymm12,%ymm7 + +# qhasm: b2 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b2=reg256#14 +# asm 2: vbroadcasti128 0(b2=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm2,%ymm9 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm4,%ymm9 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm6,%ymm9 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm8,%ymm9 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#10 +# asm 2: vpand r=%ymm9 +vpand %ymm13,%ymm10,%ymm9 + +# qhasm: r3 ^= r +# asm 1: vpxor r2=reg256#10 +# asm 2: vpand r2=%ymm9 +vpand %ymm13,%ymm12,%ymm9 + +# qhasm: b1 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b1=reg256#14 +# asm 2: vbroadcasti128 
0(b1=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#15 +# asm 2: vpand r=%ymm14 +vpand %ymm13,%ymm1,%ymm14 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm2,%ymm11 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm4,%ymm11 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm6,%ymm11 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm8,%ymm11 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#12 +# asm 2: vpand r=%ymm11 +vpand %ymm13,%ymm10,%ymm11 + +# qhasm: r2 ^= r +# asm 1: vpxor r1=reg256#12 +# asm 2: vpand r1=%ymm11 +vpand %ymm13,%ymm12,%ymm11 + +# qhasm: b0 = mem128[ input_2 + 0 ] x2 +# asm 1: vbroadcasti128 0(b0=reg256#14 +# asm 2: vbroadcasti128 0(b0=%ymm13 +vbroadcasti128 0(%rdx), %ymm13 + +# qhasm: input_2 -= input_3 +# asm 1: sub r=reg256#2 +# asm 2: vpand r=%ymm1 +vpand %ymm13,%ymm1,%ymm1 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm2,%ymm0 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm4,%ymm0 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm6,%ymm0 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm8,%ymm0 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm13,%ymm10,%ymm0 + +# qhasm: r1 ^= r +# asm 1: vpxor r0=reg256#1 +# asm 2: vpand r0=%ymm0 +vpand %ymm13,%ymm12,%ymm0 + +# qhasm: mem256[ ptr + 160 ] = r5 +# asm 1: vmovupd h24=reg128#1 +# asm 2: movdqu 560(h24=%xmm0 +movdqu 560(%r8),%xmm0 + +# qhasm: h11 = h24 +# asm 1: movdqa h11=reg128#2 +# asm 2: movdqa h11=%xmm1 +movdqa %xmm0,%xmm1 + +# qhasm: h12 = h24 +# asm 1: movdqa h12=reg128#3 +# asm 2: movdqa h12=%xmm2 +movdqa %xmm0,%xmm2 + +# qhasm: h14 = h24 +# asm 1: movdqa h14=reg128#4 +# asm 2: movdqa h14=%xmm3 +movdqa %xmm0,%xmm3 + +# qhasm: h15 = h24 +# asm 1: movdqa h15=reg128#1 +# asm 2: movdqa h15=%xmm0 +movdqa %xmm0,%xmm0 + +# qhasm: h23 = mem128[ ptr + 528 ] +# asm 1: movdqu 528(h23=reg128#5 +# asm 2: movdqu 528(h23=%xmm4 +movdqu 528(%r8),%xmm4 + +# qhasm: h10 = h23 +# asm 1: movdqa h10=reg128#6 +# asm 2: movdqa h10=%xmm5 +movdqa %xmm4,%xmm5 + +# qhasm: h11 = h11 ^ h23 +# asm 1: vpxor h11=reg128#2 +# asm 2: vpxor h11=%xmm1 +vpxor %xmm4,%xmm1,%xmm1 + +# qhasm: h13 = h23 +# asm 1: movdqa h13=reg128#7 +# asm 2: movdqa h13=%xmm6 +movdqa %xmm4,%xmm6 + +# qhasm: h14 = h14 ^ h23 +# asm 1: vpxor h14=reg128#4 +# asm 2: vpxor h14=%xmm3 +vpxor %xmm4,%xmm3,%xmm3 + +# qhasm: h22 = mem128[ ptr + 496 ] +# asm 1: movdqu 496(h22=reg128#5 +# asm 2: movdqu 496(h22=%xmm4 +movdqu 496(%r8),%xmm4 + +# qhasm: h9 = h22 +# asm 1: movdqa h9=reg128#8 +# asm 2: movdqa h9=%xmm7 +movdqa %xmm4,%xmm7 + +# qhasm: h10 = h10 ^ h22 +# asm 1: vpxor h10=reg128#6 +# asm 2: vpxor h10=%xmm5 +vpxor %xmm4,%xmm5,%xmm5 + +# qhasm: h12 = h12 ^ h22 +# asm 1: vpxor h12=reg128#3 +# asm 2: vpxor h12=%xmm2 +vpxor %xmm4,%xmm2,%xmm2 + +# qhasm: h13 = h13 ^ h22 +# asm 1: vpxor h13=reg128#5 +# asm 2: vpxor h13=%xmm4 +vpxor %xmm4,%xmm6,%xmm4 + +# qhasm: h21 = mem128[ ptr + 464 ] +# asm 1: movdqu 464(h21=reg128#7 +# asm 2: movdqu 464(h21=%xmm6 +movdqu 464(%r8),%xmm6 + +# qhasm: h8 = h21 +# asm 1: movdqa h8=reg128#9 +# asm 2: movdqa h8=%xmm8 +movdqa %xmm6,%xmm8 + +# qhasm: h9 = h9 ^ h21 +# asm 1: vpxor h9=reg128#8 +# asm 2: vpxor h9=%xmm7 +vpxor %xmm6,%xmm7,%xmm7 + +# qhasm: h11 = h11 ^ 
h21 +# asm 1: vpxor h11=reg128#2 +# asm 2: vpxor h11=%xmm1 +vpxor %xmm6,%xmm1,%xmm1 + +# qhasm: h12 = h12 ^ h21 +# asm 1: vpxor h12=reg128#3 +# asm 2: vpxor h12=%xmm2 +vpxor %xmm6,%xmm2,%xmm2 + +# qhasm: h20 = mem128[ ptr + 432 ] +# asm 1: movdqu 432(h20=reg128#7 +# asm 2: movdqu 432(h20=%xmm6 +movdqu 432(%r8),%xmm6 + +# qhasm: h7 = h20 +# asm 1: movdqa h7=reg128#10 +# asm 2: movdqa h7=%xmm9 +movdqa %xmm6,%xmm9 + +# qhasm: h8 = h8 ^ h20 +# asm 1: vpxor h8=reg128#9 +# asm 2: vpxor h8=%xmm8 +vpxor %xmm6,%xmm8,%xmm8 + +# qhasm: h10 = h10 ^ h20 +# asm 1: vpxor h10=reg128#6 +# asm 2: vpxor h10=%xmm5 +vpxor %xmm6,%xmm5,%xmm5 + +# qhasm: h11 = h11 ^ h20 +# asm 1: vpxor h11=reg128#2 +# asm 2: vpxor h11=%xmm1 +vpxor %xmm6,%xmm1,%xmm1 + +# qhasm: h19 = mem128[ ptr + 400 ] +# asm 1: movdqu 400(h19=reg128#7 +# asm 2: movdqu 400(h19=%xmm6 +movdqu 400(%r8),%xmm6 + +# qhasm: h6 = h19 +# asm 1: movdqa h6=reg128#11 +# asm 2: movdqa h6=%xmm10 +movdqa %xmm6,%xmm10 + +# qhasm: h7 = h7 ^ h19 +# asm 1: vpxor h7=reg128#10 +# asm 2: vpxor h7=%xmm9 +vpxor %xmm6,%xmm9,%xmm9 + +# qhasm: h9 = h9 ^ h19 +# asm 1: vpxor h9=reg128#8 +# asm 2: vpxor h9=%xmm7 +vpxor %xmm6,%xmm7,%xmm7 + +# qhasm: h10 = h10 ^ h19 +# asm 1: vpxor h10=reg128#6 +# asm 2: vpxor h10=%xmm5 +vpxor %xmm6,%xmm5,%xmm5 + +# qhasm: h18 = mem128[ ptr + 368 ] +# asm 1: movdqu 368(h18=reg128#7 +# asm 2: movdqu 368(h18=%xmm6 +movdqu 368(%r8),%xmm6 + +# qhasm: h18 = h18 ^ mem128[ ptr + 576 ] +# asm 1: vpxor 576(h18=reg128#7 +# asm 2: vpxor 576(h18=%xmm6 +vpxor 576(%r8),%xmm6,%xmm6 + +# qhasm: h5 = h18 +# asm 1: movdqa h5=reg128#12 +# asm 2: movdqa h5=%xmm11 +movdqa %xmm6,%xmm11 + +# qhasm: h6 = h6 ^ h18 +# asm 1: vpxor h6=reg128#11 +# asm 2: vpxor h6=%xmm10 +vpxor %xmm6,%xmm10,%xmm10 + +# qhasm: h8 = h8 ^ h18 +# asm 1: vpxor h8=reg128#9 +# asm 2: vpxor h8=%xmm8 +vpxor %xmm6,%xmm8,%xmm8 + +# qhasm: h9 = h9 ^ h18 +# asm 1: vpxor h9=reg128#7 +# asm 2: vpxor h9=%xmm6 +vpxor %xmm6,%xmm7,%xmm6 + +# qhasm: h17 = mem128[ ptr + 336 ] +# asm 1: movdqu 336(h17=reg128#8 +# asm 2: movdqu 336(h17=%xmm7 +movdqu 336(%r8),%xmm7 + +# qhasm: h17 = h17 ^ mem128[ ptr + 544 ] +# asm 1: vpxor 544(h17=reg128#8 +# asm 2: vpxor 544(h17=%xmm7 +vpxor 544(%r8),%xmm7,%xmm7 + +# qhasm: h4 = h17 +# asm 1: movdqa h4=reg128#13 +# asm 2: movdqa h4=%xmm12 +movdqa %xmm7,%xmm12 + +# qhasm: h5 = h5 ^ h17 +# asm 1: vpxor h5=reg128#12 +# asm 2: vpxor h5=%xmm11 +vpxor %xmm7,%xmm11,%xmm11 + +# qhasm: h7 = h7 ^ h17 +# asm 1: vpxor h7=reg128#10 +# asm 2: vpxor h7=%xmm9 +vpxor %xmm7,%xmm9,%xmm9 + +# qhasm: h8 = h8 ^ h17 +# asm 1: vpxor h8=reg128#8 +# asm 2: vpxor h8=%xmm7 +vpxor %xmm7,%xmm8,%xmm7 + +# qhasm: h16 = mem128[ ptr + 304 ] +# asm 1: movdqu 304(h16=reg128#9 +# asm 2: movdqu 304(h16=%xmm8 +movdqu 304(%r8),%xmm8 + +# qhasm: h16 = h16 ^ mem128[ ptr + 512 ] +# asm 1: vpxor 512(h16=reg128#9 +# asm 2: vpxor 512(h16=%xmm8 +vpxor 512(%r8),%xmm8,%xmm8 + +# qhasm: h3 = h16 +# asm 1: movdqa h3=reg128#14 +# asm 2: movdqa h3=%xmm13 +movdqa %xmm8,%xmm13 + +# qhasm: h4 = h4 ^ h16 +# asm 1: vpxor h4=reg128#13 +# asm 2: vpxor h4=%xmm12 +vpxor %xmm8,%xmm12,%xmm12 + +# qhasm: h6 = h6 ^ h16 +# asm 1: vpxor h6=reg128#11 +# asm 2: vpxor h6=%xmm10 +vpxor %xmm8,%xmm10,%xmm10 + +# qhasm: h7 = h7 ^ h16 +# asm 1: vpxor h7=reg128#9 +# asm 2: vpxor h7=%xmm8 +vpxor %xmm8,%xmm9,%xmm8 + +# qhasm: h15 = h15 ^ mem128[ ptr + 272 ] +# asm 1: vpxor 272(h15=reg128#1 +# asm 2: vpxor 272(h15=%xmm0 +vpxor 272(%r8),%xmm0,%xmm0 + +# qhasm: h15 = h15 ^ mem128[ ptr + 480 ] +# asm 1: vpxor 480(h15=reg128#1 +# asm 2: vpxor 480(h15=%xmm0 
+vpxor 480(%r8),%xmm0,%xmm0 + +# qhasm: h2 = h15 +# asm 1: movdqa h2=reg128#10 +# asm 2: movdqa h2=%xmm9 +movdqa %xmm0,%xmm9 + +# qhasm: h3 = h3 ^ h15 +# asm 1: vpxor h3=reg128#14 +# asm 2: vpxor h3=%xmm13 +vpxor %xmm0,%xmm13,%xmm13 + +# qhasm: h5 = h5 ^ h15 +# asm 1: vpxor h5=reg128#12 +# asm 2: vpxor h5=%xmm11 +vpxor %xmm0,%xmm11,%xmm11 + +# qhasm: h6 = h6 ^ h15 +# asm 1: vpxor h6=reg128#1 +# asm 2: vpxor h6=%xmm0 +vpxor %xmm0,%xmm10,%xmm0 + +# qhasm: h14 = h14 ^ mem128[ ptr + 240 ] +# asm 1: vpxor 240(h14=reg128#4 +# asm 2: vpxor 240(h14=%xmm3 +vpxor 240(%r8),%xmm3,%xmm3 + +# qhasm: h14 = h14 ^ mem128[ ptr + 448 ] +# asm 1: vpxor 448(h14=reg128#4 +# asm 2: vpxor 448(h14=%xmm3 +vpxor 448(%r8),%xmm3,%xmm3 + +# qhasm: h1 = h14 +# asm 1: movdqa h1=reg128#11 +# asm 2: movdqa h1=%xmm10 +movdqa %xmm3,%xmm10 + +# qhasm: h2 = h2 ^ h14 +# asm 1: vpxor h2=reg128#10 +# asm 2: vpxor h2=%xmm9 +vpxor %xmm3,%xmm9,%xmm9 + +# qhasm: h4 = h4 ^ h14 +# asm 1: vpxor h4=reg128#13 +# asm 2: vpxor h4=%xmm12 +vpxor %xmm3,%xmm12,%xmm12 + +# qhasm: h5 = h5 ^ h14 +# asm 1: vpxor h5=reg128#4 +# asm 2: vpxor h5=%xmm3 +vpxor %xmm3,%xmm11,%xmm3 + +# qhasm: h13 = h13 ^ mem128[ ptr + 208 ] +# asm 1: vpxor 208(h13=reg128#5 +# asm 2: vpxor 208(h13=%xmm4 +vpxor 208(%r8),%xmm4,%xmm4 + +# qhasm: h13 = h13 ^ mem128[ ptr + 416 ] +# asm 1: vpxor 416(h13=reg128#5 +# asm 2: vpxor 416(h13=%xmm4 +vpxor 416(%r8),%xmm4,%xmm4 + +# qhasm: h0 = h13 +# asm 1: movdqa h0=reg128#12 +# asm 2: movdqa h0=%xmm11 +movdqa %xmm4,%xmm11 + +# qhasm: h1 = h1 ^ h13 +# asm 1: vpxor h1=reg128#11 +# asm 2: vpxor h1=%xmm10 +vpxor %xmm4,%xmm10,%xmm10 + +# qhasm: h3 = h3 ^ h13 +# asm 1: vpxor h3=reg128#14 +# asm 2: vpxor h3=%xmm13 +vpxor %xmm4,%xmm13,%xmm13 + +# qhasm: h4 = h4 ^ h13 +# asm 1: vpxor h4=reg128#5 +# asm 2: vpxor h4=%xmm4 +vpxor %xmm4,%xmm12,%xmm4 + +# qhasm: h12 = h12 ^ mem128[ ptr + 384 ] +# asm 1: vpxor 384(h12=reg128#3 +# asm 2: vpxor 384(h12=%xmm2 +vpxor 384(%r8),%xmm2,%xmm2 + +# qhasm: h12 = h12 ^ mem128[ ptr + 176 ] +# asm 1: vpxor 176(h12=reg128#3 +# asm 2: vpxor 176(h12=%xmm2 +vpxor 176(%r8),%xmm2,%xmm2 + +# qhasm: mem128[ input_0 + 192 ] = h12 +# asm 1: movdqu h11=reg128#2 +# asm 2: vpxor 352(h11=%xmm1 +vpxor 352(%r8),%xmm1,%xmm1 + +# qhasm: h11 = h11 ^ mem128[ ptr + 144 ] +# asm 1: vpxor 144(h11=reg128#2 +# asm 2: vpxor 144(h11=%xmm1 +vpxor 144(%r8),%xmm1,%xmm1 + +# qhasm: mem128[ input_0 + 176 ] = h11 +# asm 1: movdqu h10=reg128#2 +# asm 2: vpxor 320(h10=%xmm1 +vpxor 320(%r8),%xmm5,%xmm1 + +# qhasm: h10 = h10 ^ mem128[ ptr + 112 ] +# asm 1: vpxor 112(h10=reg128#2 +# asm 2: vpxor 112(h10=%xmm1 +vpxor 112(%r8),%xmm1,%xmm1 + +# qhasm: mem128[ input_0 + 160 ] = h10 +# asm 1: movdqu h9=reg128#2 +# asm 2: vpxor 288(h9=%xmm1 +vpxor 288(%r8),%xmm6,%xmm1 + +# qhasm: h9 = h9 ^ mem128[ ptr + 80 ] +# asm 1: vpxor 80(h9=reg128#2 +# asm 2: vpxor 80(h9=%xmm1 +vpxor 80(%r8),%xmm1,%xmm1 + +# qhasm: mem128[ input_0 + 144 ] = h9 +# asm 1: movdqu h8=reg128#2 +# asm 2: vpxor 256(h8=%xmm1 +vpxor 256(%r8),%xmm7,%xmm1 + +# qhasm: h8 = h8 ^ mem128[ ptr + 48 ] +# asm 1: vpxor 48(h8=reg128#2 +# asm 2: vpxor 48(h8=%xmm1 +vpxor 48(%r8),%xmm1,%xmm1 + +# qhasm: mem128[ input_0 + 128 ] = h8 +# asm 1: movdqu h7=reg128#2 +# asm 2: vpxor 224(h7=%xmm1 +vpxor 224(%r8),%xmm8,%xmm1 + +# qhasm: h7 = h7 ^ mem128[ ptr + 16 ] +# asm 1: vpxor 16(h7=reg128#2 +# asm 2: vpxor 16(h7=%xmm1 +vpxor 16(%r8),%xmm1,%xmm1 + +# qhasm: mem128[ input_0 + 112 ] = h7 +# asm 1: movdqu h6=reg128#1 +# asm 2: vpxor 192(h6=%xmm0 +vpxor 192(%r8),%xmm0,%xmm0 + +# qhasm: mem128[ input_0 + 96 ] = h6 
+# asm 1: movdqu h5=reg128#1 +# asm 2: vpxor 160(h5=%xmm0 +vpxor 160(%r8),%xmm3,%xmm0 + +# qhasm: mem128[ input_0 + 80 ] = h5 +# asm 1: movdqu h4=reg128#1 +# asm 2: vpxor 128(h4=%xmm0 +vpxor 128(%r8),%xmm4,%xmm0 + +# qhasm: mem128[ input_0 + 64 ] = h4 +# asm 1: movdqu h3=reg128#1 +# asm 2: vpxor 96(h3=%xmm0 +vpxor 96(%r8),%xmm13,%xmm0 + +# qhasm: mem128[ input_0 + 48 ] = h3 +# asm 1: movdqu h2=reg128#1 +# asm 2: vpxor 64(h2=%xmm0 +vpxor 64(%r8),%xmm9,%xmm0 + +# qhasm: mem128[ input_0 + 32 ] = h2 +# asm 1: movdqu h1=reg128#1 +# asm 2: vpxor 32(h1=%xmm0 +vpxor 32(%r8),%xmm10,%xmm0 + +# qhasm: mem128[ input_0 + 16 ] = h1 +# asm 1: movdqu h0=reg128#1 +# asm 2: vpxor 0(h0=%xmm0 +vpxor 0(%r8),%xmm11,%xmm0 + +# qhasm: mem128[ input_0 + 0 ] = h0 +# asm 1: movdqu + +typedef __m256i vec256; + +vec256 PQCLEAN_MCELIECE8192128F_AVX_vec256_set1_16b(uint16_t a); +vec256 PQCLEAN_MCELIECE8192128F_AVX_vec256_setzero(void); +vec256 PQCLEAN_MCELIECE8192128F_AVX_vec256_set4x(uint64_t a0, uint64_t a1, uint64_t a2, uint64_t a3); + +// Extract requires a literal argument so need to be macros +#define PQCLEAN_MCELIECE8192128F_AVX_vec256_extract2x(a,i) ((vec128) _mm256_extractf128_si256((vec256) (a),(i))) +#define PQCLEAN_MCELIECE8192128F_AVX_vec256_extract(a,i) ((uint64_t) _mm256_extract_epi64((vec256) (a),(i))) + +int PQCLEAN_MCELIECE8192128F_AVX_vec256_testz(vec256 a); +vec256 PQCLEAN_MCELIECE8192128F_AVX_vec256_and(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE8192128F_AVX_vec256_xor(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE8192128F_AVX_vec256_or(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE8192128F_AVX_vec256_sll_4x(vec256 a, int s); +vec256 PQCLEAN_MCELIECE8192128F_AVX_vec256_srl_4x(vec256 a, int s); +vec256 PQCLEAN_MCELIECE8192128F_AVX_vec256_unpack_low(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE8192128F_AVX_vec256_unpack_high(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE8192128F_AVX_vec256_unpack_low_2x(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE8192128F_AVX_vec256_unpack_high_2x(vec256 a, vec256 b); +vec256 PQCLEAN_MCELIECE8192128F_AVX_vec256_or_reduce(const vec256 *a); +void PQCLEAN_MCELIECE8192128F_AVX_vec256_copy(vec256 *dest, const vec256 *src); + +/* bitsliced field multiplications */ +void PQCLEAN_MCELIECE8192128F_AVX_vec256_mul(vec256 *h, vec256 *f, const vec256 *g); +void PQCLEAN_MCELIECE8192128F_AVX_vec256_sq(vec256 * /*out*/, const vec256 * /*in*/); +void PQCLEAN_MCELIECE8192128F_AVX_vec256_inv(vec256 * /*out*/, const vec256 * /*in*/); + +extern void PQCLEAN_MCELIECE8192128F_AVX_vec256_maa_asm(vec256 *, vec256 *, const vec256 *); +extern void PQCLEAN_MCELIECE8192128F_AVX_vec256_ama_asm(vec256 *, vec256 *, const vec256 *); + +#endif + diff --git a/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/vec256_ama_asm.S b/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/vec256_ama_asm.S new file mode 100644 index 0000000000..de644122a8 --- /dev/null +++ b/src/kem/classic_mceliece/pqclean_mceliece8192128f_avx/vec256_ama_asm.S @@ -0,0 +1,2322 @@ + +# qhasm: int64 input_0 + +# qhasm: int64 input_1 + +# qhasm: int64 input_2 + +# qhasm: int64 input_3 + +# qhasm: int64 input_4 + +# qhasm: int64 input_5 + +# qhasm: stack64 input_6 + +# qhasm: stack64 input_7 + +# qhasm: int64 caller_r11 + +# qhasm: int64 caller_r12 + +# qhasm: int64 caller_r13 + +# qhasm: int64 caller_r14 + +# qhasm: int64 caller_r15 + +# qhasm: int64 caller_rbx + +# qhasm: int64 caller_rbp + +# qhasm: reg256 a0 + +# qhasm: reg256 a1 + +# qhasm: reg256 a2 + +# qhasm: reg256 a3 + +# qhasm: reg256 a4 + +# qhasm: reg256 a5 + +# 
qhasm: reg256 a6 + +# qhasm: reg256 a7 + +# qhasm: reg256 a8 + +# qhasm: reg256 a9 + +# qhasm: reg256 a10 + +# qhasm: reg256 a11 + +# qhasm: reg256 a12 + +# qhasm: reg256 b0 + +# qhasm: reg256 b1 + +# qhasm: reg256 r0 + +# qhasm: reg256 r1 + +# qhasm: reg256 r2 + +# qhasm: reg256 r3 + +# qhasm: reg256 r4 + +# qhasm: reg256 r5 + +# qhasm: reg256 r6 + +# qhasm: reg256 r7 + +# qhasm: reg256 r8 + +# qhasm: reg256 r9 + +# qhasm: reg256 r10 + +# qhasm: reg256 r11 + +# qhasm: reg256 r12 + +# qhasm: reg256 r13 + +# qhasm: reg256 r14 + +# qhasm: reg256 r15 + +# qhasm: reg256 r16 + +# qhasm: reg256 r17 + +# qhasm: reg256 r18 + +# qhasm: reg256 r19 + +# qhasm: reg256 r20 + +# qhasm: reg256 r21 + +# qhasm: reg256 r22 + +# qhasm: reg256 r23 + +# qhasm: reg256 r24 + +# qhasm: reg256 r + +# qhasm: enter vec256_ama_asm +.p2align 5 +.global _PQCLEAN_MCELIECE8192128F_AVX_vec256_ama_asm +.global PQCLEAN_MCELIECE8192128F_AVX_vec256_ama_asm +_PQCLEAN_MCELIECE8192128F_AVX_vec256_ama_asm: +PQCLEAN_MCELIECE8192128F_AVX_vec256_ama_asm: +mov %rsp,%r11 +and $31,%r11 +add $0,%r11 +sub %r11,%rsp + +# qhasm: b0 = mem256[ input_2 + 0 ] +# asm 1: vmovupd 0(b0=reg256#1 +# asm 2: vmovupd 0(b0=%ymm0 +vmovupd 0(%rdx),%ymm0 + +# qhasm: a12 = mem256[ input_0 + 384 ] +# asm 1: vmovupd 384(a12=reg256#2 +# asm 2: vmovupd 384(a12=%ymm1 +vmovupd 384(%rdi),%ymm1 + +# qhasm: a12 = a12 ^ mem256[ input_1 + 384 ] +# asm 1: vpxor 384(a12=reg256#2 +# asm 2: vpxor 384(a12=%ymm1 +vpxor 384(%rsi),%ymm1,%ymm1 + +# qhasm: mem256[ input_0 + 384 ] = a12 +# asm 1: vmovupd r12=reg256#3 +# asm 2: vpand r12=%ymm2 +vpand %ymm1,%ymm0,%ymm2 + +# qhasm: r13 = a12 & mem256[input_2 + 32] +# asm 1: vpand 32(r13=reg256#4 +# asm 2: vpand 32(r13=%ymm3 +vpand 32(%rdx),%ymm1,%ymm3 + +# qhasm: r14 = a12 & mem256[input_2 + 64] +# asm 1: vpand 64(r14=reg256#5 +# asm 2: vpand 64(r14=%ymm4 +vpand 64(%rdx),%ymm1,%ymm4 + +# qhasm: r15 = a12 & mem256[input_2 + 96] +# asm 1: vpand 96(r15=reg256#6 +# asm 2: vpand 96(r15=%ymm5 +vpand 96(%rdx),%ymm1,%ymm5 + +# qhasm: r16 = a12 & mem256[input_2 + 128] +# asm 1: vpand 128(r16=reg256#7 +# asm 2: vpand 128(r16=%ymm6 +vpand 128(%rdx),%ymm1,%ymm6 + +# qhasm: r17 = a12 & mem256[input_2 + 160] +# asm 1: vpand 160(r17=reg256#8 +# asm 2: vpand 160(r17=%ymm7 +vpand 160(%rdx),%ymm1,%ymm7 + +# qhasm: r18 = a12 & mem256[input_2 + 192] +# asm 1: vpand 192(r18=reg256#9 +# asm 2: vpand 192(r18=%ymm8 +vpand 192(%rdx),%ymm1,%ymm8 + +# qhasm: r19 = a12 & mem256[input_2 + 224] +# asm 1: vpand 224(r19=reg256#10 +# asm 2: vpand 224(r19=%ymm9 +vpand 224(%rdx),%ymm1,%ymm9 + +# qhasm: r20 = a12 & mem256[input_2 + 256] +# asm 1: vpand 256(r20=reg256#11 +# asm 2: vpand 256(r20=%ymm10 +vpand 256(%rdx),%ymm1,%ymm10 + +# qhasm: r21 = a12 & mem256[input_2 + 288] +# asm 1: vpand 288(r21=reg256#12 +# asm 2: vpand 288(r21=%ymm11 +vpand 288(%rdx),%ymm1,%ymm11 + +# qhasm: r22 = a12 & mem256[input_2 + 320] +# asm 1: vpand 320(r22=reg256#13 +# asm 2: vpand 320(r22=%ymm12 +vpand 320(%rdx),%ymm1,%ymm12 + +# qhasm: r23 = a12 & mem256[input_2 + 352] +# asm 1: vpand 352(r23=reg256#14 +# asm 2: vpand 352(r23=%ymm13 +vpand 352(%rdx),%ymm1,%ymm13 + +# qhasm: r24 = a12 & mem256[input_2 + 384] +# asm 1: vpand 384(r24=reg256#2 +# asm 2: vpand 384(r24=%ymm1 +vpand 384(%rdx),%ymm1,%ymm1 + +# qhasm: r15 ^= r24 +# asm 1: vpxor r11=reg256#2 +# asm 2: vmovapd r11=%ymm1 +vmovapd %ymm1,%ymm1 + +# qhasm: a11 = mem256[ input_0 + 352 ] +# asm 1: vmovupd 352(a11=reg256#15 +# asm 2: vmovupd 352(a11=%ymm14 +vmovupd 352(%rdi),%ymm14 + +# qhasm: a11 = a11 ^ mem256[ input_1 + 352 ] +# asm 
1: vpxor 352(a11=reg256#15 +# asm 2: vpxor 352(a11=%ymm14 +vpxor 352(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 352 ] = a11 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r21 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r22 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r23 ^= r +# asm 1: vpxor r10=reg256#14 +# asm 2: vmovapd r10=%ymm13 +vmovapd %ymm13,%ymm13 + +# qhasm: a10 = mem256[ input_0 + 320 ] +# asm 1: vmovupd 320(a10=reg256#15 +# asm 2: vmovupd 320(a10=%ymm14 +vmovupd 320(%rdi),%ymm14 + +# qhasm: a10 = a10 ^ mem256[ input_1 + 320 ] +# asm 1: vpxor 320(a10=reg256#15 +# asm 2: vpxor 320(a10=%ymm14 +vpxor 320(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 320 ] = a10 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r21 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r22 ^= r +# asm 1: vpxor r9=reg256#13 +# asm 2: vmovapd r9=%ymm12 +vmovapd %ymm12,%ymm12 + +# qhasm: a9 = mem256[ input_0 + 288 ] +# asm 1: vmovupd 288(a9=reg256#15 +# asm 2: vmovupd 288(a9=%ymm14 +vmovupd 
288(%rdi),%ymm14 + +# qhasm: a9 = a9 ^ mem256[ input_1 + 288 ] +# asm 1: vpxor 288(a9=reg256#15 +# asm 2: vpxor 288(a9=%ymm14 +vpxor 288(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 288 ] = a9 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r21 ^= r +# asm 1: vpxor r8=reg256#12 +# asm 2: vmovapd r8=%ymm11 +vmovapd %ymm11,%ymm11 + +# qhasm: a8 = mem256[ input_0 + 256 ] +# asm 1: vmovupd 256(a8=reg256#15 +# asm 2: vmovupd 256(a8=%ymm14 +vmovupd 256(%rdi),%ymm14 + +# qhasm: a8 = a8 ^ mem256[ input_1 + 256 ] +# asm 1: vpxor 256(a8=reg256#15 +# asm 2: vpxor 256(a8=%ymm14 +vpxor 256(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 256 ] = a8 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r20 ^= r +# asm 1: vpxor r7=reg256#11 +# asm 2: vmovapd r7=%ymm10 +vmovapd %ymm10,%ymm10 + +# qhasm: a7 = mem256[ input_0 + 224 ] +# asm 1: vmovupd 
224(a7=reg256#15 +# asm 2: vmovupd 224(a7=%ymm14 +vmovupd 224(%rdi),%ymm14 + +# qhasm: a7 = a7 ^ mem256[ input_1 + 224 ] +# asm 1: vpxor 224(a7=reg256#15 +# asm 2: vpxor 224(a7=%ymm14 +vpxor 224(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 224 ] = a7 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r19 ^= r +# asm 1: vpxor r6=reg256#10 +# asm 2: vmovapd r6=%ymm9 +vmovapd %ymm9,%ymm9 + +# qhasm: a6 = mem256[ input_0 + 192 ] +# asm 1: vmovupd 192(a6=reg256#15 +# asm 2: vmovupd 192(a6=%ymm14 +vmovupd 192(%rdi),%ymm14 + +# qhasm: a6 = a6 ^ mem256[ input_1 + 192 ] +# asm 1: vpxor 192(a6=reg256#15 +# asm 2: vpxor 192(a6=%ymm14 +vpxor 192(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 192 ] = a6 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r18 ^= r +# asm 1: vpxor r5=reg256#9 +# asm 2: vmovapd r5=%ymm8 +vmovapd %ymm8,%ymm8 + +# qhasm: a5 = mem256[ 
input_0 + 160 ] +# asm 1: vmovupd 160(a5=reg256#15 +# asm 2: vmovupd 160(a5=%ymm14 +vmovupd 160(%rdi),%ymm14 + +# qhasm: a5 = a5 ^ mem256[ input_1 + 160 ] +# asm 1: vpxor 160(a5=reg256#15 +# asm 2: vpxor 160(a5=%ymm14 +vpxor 160(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 160 ] = a5 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r17 ^= r +# asm 1: vpxor r4=reg256#8 +# asm 2: vmovapd r4=%ymm7 +vmovapd %ymm7,%ymm7 + +# qhasm: a4 = mem256[ input_0 + 128 ] +# asm 1: vmovupd 128(a4=reg256#15 +# asm 2: vmovupd 128(a4=%ymm14 +vmovupd 128(%rdi),%ymm14 + +# qhasm: a4 = a4 ^ mem256[ input_1 + 128 ] +# asm 1: vpxor 128(a4=reg256#15 +# asm 2: vpxor 128(a4=%ymm14 +vpxor 128(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 128 ] = a4 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r16 ^= r +# asm 1: vpxor r3=reg256#7 +# asm 2: vmovapd r3=%ymm6 +vmovapd 
%ymm6,%ymm6 + +# qhasm: a3 = mem256[ input_0 + 96 ] +# asm 1: vmovupd 96(a3=reg256#15 +# asm 2: vmovupd 96(a3=%ymm14 +vmovupd 96(%rdi),%ymm14 + +# qhasm: a3 = a3 ^ mem256[ input_1 + 96 ] +# asm 1: vpxor 96(a3=reg256#15 +# asm 2: vpxor 96(a3=%ymm14 +vpxor 96(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 96 ] = a3 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r15 ^= r +# asm 1: vpxor r2=reg256#6 +# asm 2: vmovapd r2=%ymm5 +vmovapd %ymm5,%ymm5 + +# qhasm: a2 = mem256[ input_0 + 64 ] +# asm 1: vmovupd 64(a2=reg256#15 +# asm 2: vmovupd 64(a2=%ymm14 +vmovupd 64(%rdi),%ymm14 + +# qhasm: a2 = a2 ^ mem256[ input_1 + 64 ] +# asm 1: vpxor 64(a2=reg256#15 +# asm 2: vpxor 64(a2=%ymm14 +vpxor 64(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 64 ] = a2 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r14 ^= r +# asm 1: vpxor r1=reg256#5 +# asm 2: vmovapd r1=%ymm4 
+vmovapd %ymm4,%ymm4 + +# qhasm: a1 = mem256[ input_0 + 32 ] +# asm 1: vmovupd 32(a1=reg256#15 +# asm 2: vmovupd 32(a1=%ymm14 +vmovupd 32(%rdi),%ymm14 + +# qhasm: a1 = a1 ^ mem256[ input_1 + 32 ] +# asm 1: vpxor 32(a1=reg256#15 +# asm 2: vpxor 32(a1=%ymm14 +vpxor 32(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 32 ] = a1 +# asm 1: vmovupd r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r1 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r13 ^= r +# asm 1: vpxor r0=reg256#4 +# asm 2: vmovapd r0=%ymm3 +vmovapd %ymm3,%ymm3 + +# qhasm: a0 = mem256[ input_0 + 0 ] +# asm 1: vmovupd 0(a0=reg256#15 +# asm 2: vmovupd 0(a0=%ymm14 +vmovupd 0(%rdi),%ymm14 + +# qhasm: a0 = a0 ^ mem256[ input_1 + 0 ] +# asm 1: vpxor 0(a0=reg256#15 +# asm 2: vpxor 0(a0=%ymm14 +vpxor 0(%rsi),%ymm14,%ymm14 + +# qhasm: mem256[ input_0 + 0 ] = a0 +# asm 1: vmovupd r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm14,%ymm0,%ymm0 + +# qhasm: r0 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 32(r=%ymm0 +vpand 32(%rdx),%ymm14,%ymm0 + +# qhasm: r1 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 64(r=%ymm0 +vpand 64(%rdx),%ymm14,%ymm0 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 96(r=%ymm0 +vpand 96(%rdx),%ymm14,%ymm0 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 128(r=%ymm0 +vpand 128(%rdx),%ymm14,%ymm0 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 160(r=%ymm0 +vpand 160(%rdx),%ymm14,%ymm0 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 192(r=%ymm0 +vpand 192(%rdx),%ymm14,%ymm0 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 224(r=%ymm0 +vpand 224(%rdx),%ymm14,%ymm0 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 256(r=%ymm0 +vpand 256(%rdx),%ymm14,%ymm0 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 288(r=%ymm0 +vpand 288(%rdx),%ymm14,%ymm0 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 320(r=%ymm0 +vpand 320(%rdx),%ymm14,%ymm0 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 352(r=%ymm0 +vpand 352(%rdx),%ymm14,%ymm0 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 384(r=%ymm0 +vpand 384(%rdx),%ymm14,%ymm0 + +# qhasm: r12 ^= r +# asm 1: vpxor r12=reg256#1 +# asm 2: vpxor 384(r12=%ymm0 +vpxor 384(%rsi),%ymm2,%ymm0 + +# qhasm: 
mem256[ input_1 + 384 ] = r12 +# asm 1: vmovupd r11=reg256#1 +# asm 2: vpxor 352(r11=%ymm0 +vpxor 352(%rsi),%ymm1,%ymm0 + +# qhasm: mem256[ input_1 + 352 ] = r11 +# asm 1: vmovupd r10=reg256#1 +# asm 2: vpxor 320(r10=%ymm0 +vpxor 320(%rsi),%ymm13,%ymm0 + +# qhasm: mem256[ input_1 + 320 ] = r10 +# asm 1: vmovupd r9=reg256#1 +# asm 2: vpxor 288(r9=%ymm0 +vpxor 288(%rsi),%ymm12,%ymm0 + +# qhasm: mem256[ input_1 + 288 ] = r9 +# asm 1: vmovupd r8=reg256#1 +# asm 2: vpxor 256(r8=%ymm0 +vpxor 256(%rsi),%ymm11,%ymm0 + +# qhasm: mem256[ input_1 + 256 ] = r8 +# asm 1: vmovupd r7=reg256#1 +# asm 2: vpxor 224(r7=%ymm0 +vpxor 224(%rsi),%ymm10,%ymm0 + +# qhasm: mem256[ input_1 + 224 ] = r7 +# asm 1: vmovupd r6=reg256#1 +# asm 2: vpxor 192(r6=%ymm0 +vpxor 192(%rsi),%ymm9,%ymm0 + +# qhasm: mem256[ input_1 + 192 ] = r6 +# asm 1: vmovupd r5=reg256#1 +# asm 2: vpxor 160(r5=%ymm0 +vpxor 160(%rsi),%ymm8,%ymm0 + +# qhasm: mem256[ input_1 + 160 ] = r5 +# asm 1: vmovupd r4=reg256#1 +# asm 2: vpxor 128(r4=%ymm0 +vpxor 128(%rsi),%ymm7,%ymm0 + +# qhasm: mem256[ input_1 + 128 ] = r4 +# asm 1: vmovupd r3=reg256#1 +# asm 2: vpxor 96(r3=%ymm0 +vpxor 96(%rsi),%ymm6,%ymm0 + +# qhasm: mem256[ input_1 + 96 ] = r3 +# asm 1: vmovupd r2=reg256#1 +# asm 2: vpxor 64(r2=%ymm0 +vpxor 64(%rsi),%ymm5,%ymm0 + +# qhasm: mem256[ input_1 + 64 ] = r2 +# asm 1: vmovupd r1=reg256#1 +# asm 2: vpxor 32(r1=%ymm0 +vpxor 32(%rsi),%ymm4,%ymm0 + +# qhasm: mem256[ input_1 + 32 ] = r1 +# asm 1: vmovupd r0=reg256#1 +# asm 2: vpxor 0(r0=%ymm0 +vpxor 0(%rsi),%ymm3,%ymm0 + +# qhasm: mem256[ input_1 + 0 ] = r0 +# asm 1: vmovupd b0=reg256#1 +# asm 2: vmovupd 0(b0=%ymm0 +vmovupd 0(%rdx),%ymm0 + +# qhasm: a12 = mem256[ input_1 + 384 ] +# asm 1: vmovupd 384(a12=reg256#2 +# asm 2: vmovupd 384(a12=%ymm1 +vmovupd 384(%rsi),%ymm1 + +# qhasm: r12 = a12 & b0 +# asm 1: vpand r12=reg256#3 +# asm 2: vpand r12=%ymm2 +vpand %ymm1,%ymm0,%ymm2 + +# qhasm: r13 = a12 & mem256[input_2 + 32] +# asm 1: vpand 32(r13=reg256#4 +# asm 2: vpand 32(r13=%ymm3 +vpand 32(%rdx),%ymm1,%ymm3 + +# qhasm: r14 = a12 & mem256[input_2 + 64] +# asm 1: vpand 64(r14=reg256#5 +# asm 2: vpand 64(r14=%ymm4 +vpand 64(%rdx),%ymm1,%ymm4 + +# qhasm: r15 = a12 & mem256[input_2 + 96] +# asm 1: vpand 96(r15=reg256#6 +# asm 2: vpand 96(r15=%ymm5 +vpand 96(%rdx),%ymm1,%ymm5 + +# qhasm: r16 = a12 & mem256[input_2 + 128] +# asm 1: vpand 128(r16=reg256#7 +# asm 2: vpand 128(r16=%ymm6 +vpand 128(%rdx),%ymm1,%ymm6 + +# qhasm: r17 = a12 & mem256[input_2 + 160] +# asm 1: vpand 160(r17=reg256#8 +# asm 2: vpand 160(r17=%ymm7 +vpand 160(%rdx),%ymm1,%ymm7 + +# qhasm: r18 = a12 & mem256[input_2 + 192] +# asm 1: vpand 192(r18=reg256#9 +# asm 2: vpand 192(r18=%ymm8 +vpand 192(%rdx),%ymm1,%ymm8 + +# qhasm: r19 = a12 & mem256[input_2 + 224] +# asm 1: vpand 224(r19=reg256#10 +# asm 2: vpand 224(r19=%ymm9 +vpand 224(%rdx),%ymm1,%ymm9 + +# qhasm: r20 = a12 & mem256[input_2 + 256] +# asm 1: vpand 256(r20=reg256#11 +# asm 2: vpand 256(r20=%ymm10 +vpand 256(%rdx),%ymm1,%ymm10 + +# qhasm: r21 = a12 & mem256[input_2 + 288] +# asm 1: vpand 288(r21=reg256#12 +# asm 2: vpand 288(r21=%ymm11 +vpand 288(%rdx),%ymm1,%ymm11 + +# qhasm: r22 = a12 & mem256[input_2 + 320] +# asm 1: vpand 320(r22=reg256#13 +# asm 2: vpand 320(r22=%ymm12 +vpand 320(%rdx),%ymm1,%ymm12 + +# qhasm: r23 = a12 & mem256[input_2 + 352] +# asm 1: vpand 352(r23=reg256#14 +# asm 2: vpand 352(r23=%ymm13 +vpand 352(%rdx),%ymm1,%ymm13 + +# qhasm: r24 = a12 & mem256[input_2 + 384] +# asm 1: vpand 384(r24=reg256#2 +# asm 2: vpand 384(r24=%ymm1 +vpand 384(%rdx),%ymm1,%ymm1 + 
+# qhasm: r15 ^= r24 +# asm 1: vpxor r11=reg256#2 +# asm 2: vmovapd r11=%ymm1 +vmovapd %ymm1,%ymm1 + +# qhasm: a11 = mem256[ input_1 + 352 ] +# asm 1: vmovupd 352(a11=reg256#15 +# asm 2: vmovupd 352(a11=%ymm14 +vmovupd 352(%rsi),%ymm14 + +# qhasm: r = a11 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r21 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r22 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r23 ^= r +# asm 1: vpxor r10=reg256#14 +# asm 2: vmovapd r10=%ymm13 +vmovapd %ymm13,%ymm13 + +# qhasm: a10 = mem256[ input_1 + 320 ] +# asm 1: vmovupd 320(a10=reg256#15 +# asm 2: vmovupd 320(a10=%ymm14 +vmovupd 320(%rsi),%ymm14 + +# qhasm: r = a10 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r21 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r22 ^= r +# asm 1: vpxor r9=reg256#13 +# asm 2: vmovapd r9=%ymm12 +vmovapd %ymm12,%ymm12 + +# qhasm: a9 = mem256[ input_1 + 288 ] +# asm 1: vmovupd 288(a9=reg256#15 +# asm 2: vmovupd 288(a9=%ymm14 +vmovupd 288(%rsi),%ymm14 + +# qhasm: r = a9 & b0 +# 
asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r21 ^= r +# asm 1: vpxor r8=reg256#12 +# asm 2: vmovapd r8=%ymm11 +vmovapd %ymm11,%ymm11 + +# qhasm: a8 = mem256[ input_1 + 256 ] +# asm 1: vmovupd 256(a8=reg256#15 +# asm 2: vmovupd 256(a8=%ymm14 +vmovupd 256(%rsi),%ymm14 + +# qhasm: r = a8 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r20 ^= r +# asm 1: vpxor r7=reg256#11 +# asm 2: vmovapd r7=%ymm10 +vmovapd %ymm10,%ymm10 + +# qhasm: a7 = mem256[ input_1 + 224 ] +# asm 1: vmovupd 224(a7=reg256#15 +# asm 2: vmovupd 224(a7=%ymm14 +vmovupd 224(%rsi),%ymm14 + +# qhasm: r = a7 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 
64(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r19 ^= r +# asm 1: vpxor r6=reg256#10 +# asm 2: vmovapd r6=%ymm9 +vmovapd %ymm9,%ymm9 + +# qhasm: a6 = mem256[ input_1 + 192 ] +# asm 1: vmovupd 192(a6=reg256#15 +# asm 2: vmovupd 192(a6=%ymm14 +vmovupd 192(%rsi),%ymm14 + +# qhasm: r = a6 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r18 ^= r +# asm 1: vpxor r5=reg256#9 +# asm 2: vmovapd r5=%ymm8 +vmovapd %ymm8,%ymm8 + +# qhasm: a5 = mem256[ input_1 + 160 ] +# asm 1: vmovupd 160(a5=reg256#15 +# asm 2: vmovupd 160(a5=%ymm14 +vmovupd 160(%rsi),%ymm14 + +# qhasm: r = a5 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 
+# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r17 ^= r +# asm 1: vpxor r4=reg256#8 +# asm 2: vmovapd r4=%ymm7 +vmovapd %ymm7,%ymm7 + +# qhasm: a4 = mem256[ input_1 + 128 ] +# asm 1: vmovupd 128(a4=reg256#15 +# asm 2: vmovupd 128(a4=%ymm14 +vmovupd 128(%rsi),%ymm14 + +# qhasm: r = a4 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r16 ^= r +# asm 1: vpxor r3=reg256#7 +# asm 2: vmovapd r3=%ymm6 +vmovapd %ymm6,%ymm6 + +# qhasm: a3 = mem256[ input_1 + 96 ] +# asm 1: vmovupd 96(a3=reg256#15 +# asm 2: vmovupd 96(a3=%ymm14 +vmovupd 96(%rsi),%ymm14 + +# qhasm: r = a3 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= 
r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r15 ^= r +# asm 1: vpxor r2=reg256#6 +# asm 2: vmovapd r2=%ymm5 +vmovapd %ymm5,%ymm5 + +# qhasm: a2 = mem256[ input_1 + 64 ] +# asm 1: vmovupd 64(a2=reg256#15 +# asm 2: vmovupd 64(a2=%ymm14 +vmovupd 64(%rsi),%ymm14 + +# qhasm: r = a2 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r14 ^= r +# asm 1: vpxor r1=reg256#5 +# asm 2: vmovapd r1=%ymm4 +vmovapd %ymm4,%ymm4 + +# qhasm: a1 = mem256[ input_1 + 32 ] +# asm 1: vmovupd 32(a1=reg256#15 +# asm 2: vmovupd 32(a1=%ymm14 +vmovupd 32(%rsi),%ymm14 + +# qhasm: r = a1 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r1 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 
320(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r13 ^= r +# asm 1: vpxor r0=reg256#4 +# asm 2: vmovapd r0=%ymm3 +vmovapd %ymm3,%ymm3 + +# qhasm: a0 = mem256[ input_1 + 0 ] +# asm 1: vmovupd 0(a0=reg256#15 +# asm 2: vmovupd 0(a0=%ymm14 +vmovupd 0(%rsi),%ymm14 + +# qhasm: r = a0 & b0 +# asm 1: vpand r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm14,%ymm0,%ymm0 + +# qhasm: r0 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 32(r=%ymm0 +vpand 32(%rdx),%ymm14,%ymm0 + +# qhasm: r1 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 64(r=%ymm0 +vpand 64(%rdx),%ymm14,%ymm0 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 96(r=%ymm0 +vpand 96(%rdx),%ymm14,%ymm0 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 128(r=%ymm0 +vpand 128(%rdx),%ymm14,%ymm0 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 160(r=%ymm0 +vpand 160(%rdx),%ymm14,%ymm0 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 192(r=%ymm0 +vpand 192(%rdx),%ymm14,%ymm0 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 224(r=%ymm0 +vpand 224(%rdx),%ymm14,%ymm0 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 256(r=%ymm0 +vpand 256(%rdx),%ymm14,%ymm0 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 288(r=%ymm0 +vpand 288(%rdx),%ymm14,%ymm0 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 320(r=%ymm0 +vpand 320(%rdx),%ymm14,%ymm0 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 352(r=%ymm0 +vpand 352(%rdx),%ymm14,%ymm0 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 384(r=%ymm0 +vpand 384(%rdx),%ymm14,%ymm0 + +# qhasm: r12 ^= r +# asm 1: vpxor r12=reg256#1 +# asm 2: vpxor 384(r12=%ymm0 +vpxor 384(%rdi),%ymm2,%ymm0 + +# qhasm: mem256[ input_0 + 384 ] = r12 +# asm 1: vmovupd r12=reg256#1 +# asm 2: vpxor 384(r12=%ymm0 +vpxor 384(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 384 ] = r12 +# asm 1: vmovupd r11=reg256#1 +# asm 2: vpxor 352(r11=%ymm0 +vpxor 352(%rdi),%ymm1,%ymm0 + +# qhasm: mem256[ input_0 + 352 ] = r11 +# asm 1: vmovupd r11=reg256#1 +# asm 2: vpxor 352(r11=%ymm0 +vpxor 352(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 352 ] = r11 +# asm 1: vmovupd r10=reg256#1 +# asm 2: vpxor 320(r10=%ymm0 +vpxor 320(%rdi),%ymm13,%ymm0 + +# qhasm: mem256[ input_0 + 320 ] = r10 +# asm 1: vmovupd r10=reg256#1 +# asm 2: vpxor 320(r10=%ymm0 +vpxor 320(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 320 ] = r10 +# asm 1: vmovupd r9=reg256#1 +# asm 2: vpxor 288(r9=%ymm0 +vpxor 288(%rdi),%ymm12,%ymm0 + +# qhasm: mem256[ input_0 + 288 ] = r9 +# asm 1: vmovupd r9=reg256#1 +# asm 2: vpxor 288(r9=%ymm0 +vpxor 288(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 288 ] = r9 +# asm 1: vmovupd r8=reg256#1 +# asm 2: vpxor 256(r8=%ymm0 +vpxor 256(%rdi),%ymm11,%ymm0 + +# qhasm: mem256[ input_0 + 256 ] = r8 +# asm 1: vmovupd r8=reg256#1 +# asm 2: vpxor 256(r8=%ymm0 +vpxor 256(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 256 ] = r8 +# asm 1: vmovupd r7=reg256#1 +# asm 2: vpxor 224(r7=%ymm0 +vpxor 224(%rdi),%ymm10,%ymm0 + +# qhasm: mem256[ input_0 + 224 ] = r7 +# asm 1: vmovupd r7=reg256#1 +# asm 2: vpxor 224(r7=%ymm0 +vpxor 224(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 224 ] = r7 +# asm 1: vmovupd r6=reg256#1 +# asm 2: vpxor 192(r6=%ymm0 +vpxor 192(%rdi),%ymm9,%ymm0 + +# qhasm: mem256[ input_0 + 192 ] = r6 +# asm 1: 
vmovupd r6=reg256#1 +# asm 2: vpxor 192(r6=%ymm0 +vpxor 192(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 192 ] = r6 +# asm 1: vmovupd r5=reg256#1 +# asm 2: vpxor 160(r5=%ymm0 +vpxor 160(%rdi),%ymm8,%ymm0 + +# qhasm: mem256[ input_0 + 160 ] = r5 +# asm 1: vmovupd r5=reg256#1 +# asm 2: vpxor 160(r5=%ymm0 +vpxor 160(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 160 ] = r5 +# asm 1: vmovupd r4=reg256#1 +# asm 2: vpxor 128(r4=%ymm0 +vpxor 128(%rdi),%ymm7,%ymm0 + +# qhasm: mem256[ input_0 + 128 ] = r4 +# asm 1: vmovupd r4=reg256#1 +# asm 2: vpxor 128(r4=%ymm0 +vpxor 128(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 128 ] = r4 +# asm 1: vmovupd r3=reg256#1 +# asm 2: vpxor 96(r3=%ymm0 +vpxor 96(%rdi),%ymm6,%ymm0 + +# qhasm: mem256[ input_0 + 96 ] = r3 +# asm 1: vmovupd r3=reg256#1 +# asm 2: vpxor 96(r3=%ymm0 +vpxor 96(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 96 ] = r3 +# asm 1: vmovupd r2=reg256#1 +# asm 2: vpxor 64(r2=%ymm0 +vpxor 64(%rdi),%ymm5,%ymm0 + +# qhasm: mem256[ input_0 + 64 ] = r2 +# asm 1: vmovupd r2=reg256#1 +# asm 2: vpxor 64(r2=%ymm0 +vpxor 64(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 64 ] = r2 +# asm 1: vmovupd r1=reg256#1 +# asm 2: vpxor 32(r1=%ymm0 +vpxor 32(%rdi),%ymm4,%ymm0 + +# qhasm: mem256[ input_0 + 32 ] = r1 +# asm 1: vmovupd r1=reg256#1 +# asm 2: vpxor 32(r1=%ymm0 +vpxor 32(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 32 ] = r1 +# asm 1: vmovupd r0=reg256#1 +# asm 2: vpxor 0(r0=%ymm0 +vpxor 0(%rdi),%ymm3,%ymm0 + +# qhasm: mem256[ input_0 + 0 ] = r0 +# asm 1: vmovupd r0=reg256#1 +# asm 2: vpxor 0(r0=%ymm0 +vpxor 0(%rsi),%ymm0,%ymm0 + +# qhasm: mem256[ input_1 + 0 ] = r0 +# asm 1: vmovupd b0=reg256#1 +# asm 2: vmovupd 0(b0=%ymm0 +vmovupd 0(%rdx),%ymm0 + +# qhasm: a12 = mem256[ input_1 + 384 ] +# asm 1: vmovupd 384(a12=reg256#2 +# asm 2: vmovupd 384(a12=%ymm1 +vmovupd 384(%rsi),%ymm1 + +# qhasm: r12 = a12 & b0 +# asm 1: vpand r12=reg256#3 +# asm 2: vpand r12=%ymm2 +vpand %ymm1,%ymm0,%ymm2 + +# qhasm: r13 = a12 & mem256[input_2 + 32] +# asm 1: vpand 32(r13=reg256#4 +# asm 2: vpand 32(r13=%ymm3 +vpand 32(%rdx),%ymm1,%ymm3 + +# qhasm: r14 = a12 & mem256[input_2 + 64] +# asm 1: vpand 64(r14=reg256#5 +# asm 2: vpand 64(r14=%ymm4 +vpand 64(%rdx),%ymm1,%ymm4 + +# qhasm: r15 = a12 & mem256[input_2 + 96] +# asm 1: vpand 96(r15=reg256#6 +# asm 2: vpand 96(r15=%ymm5 +vpand 96(%rdx),%ymm1,%ymm5 + +# qhasm: r16 = a12 & mem256[input_2 + 128] +# asm 1: vpand 128(r16=reg256#7 +# asm 2: vpand 128(r16=%ymm6 +vpand 128(%rdx),%ymm1,%ymm6 + +# qhasm: r17 = a12 & mem256[input_2 + 160] +# asm 1: vpand 160(r17=reg256#8 +# asm 2: vpand 160(r17=%ymm7 +vpand 160(%rdx),%ymm1,%ymm7 + +# qhasm: r18 = a12 & mem256[input_2 + 192] +# asm 1: vpand 192(r18=reg256#9 +# asm 2: vpand 192(r18=%ymm8 +vpand 192(%rdx),%ymm1,%ymm8 + +# qhasm: r19 = a12 & mem256[input_2 + 224] +# asm 1: vpand 224(r19=reg256#10 +# asm 2: vpand 224(r19=%ymm9 +vpand 224(%rdx),%ymm1,%ymm9 + +# qhasm: r20 = a12 & mem256[input_2 + 256] +# asm 1: vpand 256(r20=reg256#11 +# asm 2: vpand 256(r20=%ymm10 +vpand 256(%rdx),%ymm1,%ymm10 + +# qhasm: r21 = a12 & mem256[input_2 + 288] +# asm 1: vpand 288(r21=reg256#12 +# asm 2: vpand 288(r21=%ymm11 +vpand 288(%rdx),%ymm1,%ymm11 + +# qhasm: r22 = a12 & mem256[input_2 + 320] +# asm 1: vpand 320(r22=reg256#13 +# asm 2: vpand 320(r22=%ymm12 +vpand 320(%rdx),%ymm1,%ymm12 + +# qhasm: r23 = a12 & mem256[input_2 + 352] +# asm 1: vpand 352(r23=reg256#14 +# asm 2: vpand 352(r23=%ymm13 +vpand 352(%rdx),%ymm1,%ymm13 + +# qhasm: r24 = a12 & mem256[input_2 + 384] +# asm 1: vpand 
384(r24=reg256#2 +# asm 2: vpand 384(r24=%ymm1 +vpand 384(%rdx),%ymm1,%ymm1 + +# qhasm: r15 ^= r24 +# asm 1: vpxor r11=reg256#2 +# asm 2: vmovapd r11=%ymm1 +vmovapd %ymm1,%ymm1 + +# qhasm: a11 = mem256[ input_1 + 352 ] +# asm 1: vmovupd 352(a11=reg256#15 +# asm 2: vmovupd 352(a11=%ymm14 +vmovupd 352(%rsi),%ymm14 + +# qhasm: r = a11 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r21 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r22 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r23 ^= r +# asm 1: vpxor r10=reg256#14 +# asm 2: vmovapd r10=%ymm13 +vmovapd %ymm13,%ymm13 + +# qhasm: a10 = mem256[ input_1 + 320 ] +# asm 1: vmovupd 320(a10=reg256#15 +# asm 2: vmovupd 320(a10=%ymm14 +vmovupd 320(%rsi),%ymm14 + +# qhasm: r = a10 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r21 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r22 ^= r +# asm 1: vpxor r9=reg256#13 +# asm 2: vmovapd r9=%ymm12 +vmovapd %ymm12,%ymm12 + +# qhasm: a9 = mem256[ input_1 + 288 ] +# asm 1: vmovupd 288(a9=reg256#15 +# asm 2: 
vmovupd 288(a9=%ymm14 +vmovupd 288(%rsi),%ymm14 + +# qhasm: r = a9 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r20 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r21 ^= r +# asm 1: vpxor r8=reg256#12 +# asm 2: vmovapd r8=%ymm11 +vmovapd %ymm11,%ymm11 + +# qhasm: a8 = mem256[ input_1 + 256 ] +# asm 1: vmovupd 256(a8=reg256#15 +# asm 2: vmovupd 256(a8=%ymm14 +vmovupd 256(%rsi),%ymm14 + +# qhasm: r = a8 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r19 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r20 ^= r +# asm 1: vpxor r7=reg256#11 +# asm 2: vmovapd r7=%ymm10 +vmovapd %ymm10,%ymm10 + +# qhasm: a7 = mem256[ input_1 + 224 ] +# asm 1: vmovupd 224(a7=reg256#15 +# asm 2: vmovupd 224(a7=%ymm14 +vmovupd 224(%rsi),%ymm14 + +# qhasm: r = a7 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= 
r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r18 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r19 ^= r +# asm 1: vpxor r6=reg256#10 +# asm 2: vmovapd r6=%ymm9 +vmovapd %ymm9,%ymm9 + +# qhasm: a6 = mem256[ input_1 + 192 ] +# asm 1: vmovupd 192(a6=reg256#15 +# asm 2: vmovupd 192(a6=%ymm14 +vmovupd 192(%rsi),%ymm14 + +# qhasm: r = a6 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r17 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r18 ^= r +# asm 1: vpxor r5=reg256#9 +# asm 2: vmovapd r5=%ymm8 +vmovapd %ymm8,%ymm8 + +# qhasm: a5 = mem256[ input_1 + 160 ] +# asm 1: vmovupd 160(a5=reg256#15 +# asm 2: vmovupd 160(a5=%ymm14 +vmovupd 160(%rsi),%ymm14 + +# qhasm: r = a5 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 
128(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r16 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r17 ^= r +# asm 1: vpxor r4=reg256#8 +# asm 2: vmovapd r4=%ymm7 +vmovapd %ymm7,%ymm7 + +# qhasm: a4 = mem256[ input_1 + 128 ] +# asm 1: vmovupd 128(a4=reg256#15 +# asm 2: vmovupd 128(a4=%ymm14 +vmovupd 128(%rsi),%ymm14 + +# qhasm: r = a4 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r15 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r16 ^= r +# asm 1: vpxor r3=reg256#7 +# asm 2: vmovapd r3=%ymm6 +vmovapd %ymm6,%ymm6 + +# qhasm: a3 = mem256[ input_1 + 96 ] +# asm 1: vmovupd 96(a3=reg256#15 +# asm 2: vmovupd 96(a3=%ymm14 +vmovupd 96(%rsi),%ymm14 + +# qhasm: r = a3 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 
2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r14 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r15 ^= r +# asm 1: vpxor r2=reg256#6 +# asm 2: vmovapd r2=%ymm5 +vmovapd %ymm5,%ymm5 + +# qhasm: a2 = mem256[ input_1 + 64 ] +# asm 1: vmovupd 64(a2=reg256#15 +# asm 2: vmovupd 64(a2=%ymm14 +vmovupd 64(%rsi),%ymm14 + +# qhasm: r = a2 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r13 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r14 ^= r +# asm 1: vpxor r1=reg256#5 +# asm 2: vmovapd r1=%ymm4 +vmovapd %ymm4,%ymm4 + +# qhasm: a1 = mem256[ input_1 + 32 ] +# asm 1: vmovupd 32(a1=reg256#15 +# asm 2: vmovupd 32(a1=%ymm14 +vmovupd 32(%rsi),%ymm14 + +# qhasm: r = a1 & b0 +# asm 1: vpand r=reg256#16 +# asm 2: vpand r=%ymm15 +vpand %ymm14,%ymm0,%ymm15 + +# qhasm: r1 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 32(r=%ymm15 +vpand 32(%rdx),%ymm14,%ymm15 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 64(r=%ymm15 +vpand 64(%rdx),%ymm14,%ymm15 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 96(r=%ymm15 +vpand 96(%rdx),%ymm14,%ymm15 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 128(r=%ymm15 +vpand 128(%rdx),%ymm14,%ymm15 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 160(r=%ymm15 +vpand 160(%rdx),%ymm14,%ymm15 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 192(r=%ymm15 +vpand 192(%rdx),%ymm14,%ymm15 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 224(r=%ymm15 +vpand 224(%rdx),%ymm14,%ymm15 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 256(r=%ymm15 +vpand 256(%rdx),%ymm14,%ymm15 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 288(r=%ymm15 +vpand 288(%rdx),%ymm14,%ymm15 + +# qhasm: r10 ^= r +# asm 1: 
vpxor r=reg256#16 +# asm 2: vpand 320(r=%ymm15 +vpand 320(%rdx),%ymm14,%ymm15 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#16 +# asm 2: vpand 352(r=%ymm15 +vpand 352(%rdx),%ymm14,%ymm15 + +# qhasm: r12 ^= r +# asm 1: vpxor r=reg256#15 +# asm 2: vpand 384(r=%ymm14 +vpand 384(%rdx),%ymm14,%ymm14 + +# qhasm: r13 ^= r +# asm 1: vpxor r0=reg256#4 +# asm 2: vmovapd r0=%ymm3 +vmovapd %ymm3,%ymm3 + +# qhasm: a0 = mem256[ input_1 + 0 ] +# asm 1: vmovupd 0(a0=reg256#15 +# asm 2: vmovupd 0(a0=%ymm14 +vmovupd 0(%rsi),%ymm14 + +# qhasm: r = a0 & b0 +# asm 1: vpand r=reg256#1 +# asm 2: vpand r=%ymm0 +vpand %ymm14,%ymm0,%ymm0 + +# qhasm: r0 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 32(r=%ymm0 +vpand 32(%rdx),%ymm14,%ymm0 + +# qhasm: r1 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 64(r=%ymm0 +vpand 64(%rdx),%ymm14,%ymm0 + +# qhasm: r2 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 96(r=%ymm0 +vpand 96(%rdx),%ymm14,%ymm0 + +# qhasm: r3 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 128(r=%ymm0 +vpand 128(%rdx),%ymm14,%ymm0 + +# qhasm: r4 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 160(r=%ymm0 +vpand 160(%rdx),%ymm14,%ymm0 + +# qhasm: r5 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 192(r=%ymm0 +vpand 192(%rdx),%ymm14,%ymm0 + +# qhasm: r6 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 224(r=%ymm0 +vpand 224(%rdx),%ymm14,%ymm0 + +# qhasm: r7 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 256(r=%ymm0 +vpand 256(%rdx),%ymm14,%ymm0 + +# qhasm: r8 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 288(r=%ymm0 +vpand 288(%rdx),%ymm14,%ymm0 + +# qhasm: r9 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 320(r=%ymm0 +vpand 320(%rdx),%ymm14,%ymm0 + +# qhasm: r10 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 352(r=%ymm0 +vpand 352(%rdx),%ymm14,%ymm0 + +# qhasm: r11 ^= r +# asm 1: vpxor r=reg256#1 +# asm 2: vpand 384(r=%ymm0 +vpand 384(%rdx),%ymm14,%ymm0 + +# qhasm: r12 ^= r +# asm 1: vpxor r=int64#7 +# asm 2: mov $0,>r=%rax +mov $0,%rax + +# qhasm: t0 = mem64[ input_0 + 192 ] +# asm 1: movq 192(t0=int64#2 +# asm 2: movq 192(t0=%rsi +movq 192(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 200 ] +# asm 1: movq 200(t1=int64#3 +# asm 2: movq 200(t1=%rdx +movq 200(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 176(t0=%rsi +movq 176(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 184 ] +# asm 1: movq 184(t1=int64#3 +# asm 2: movq 184(t1=%rdx +movq 184(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 160(t0=%rsi +movq 160(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 168 ] +# asm 1: movq 168(t1=int64#3 +# asm 2: movq 168(t1=%rdx +movq 168(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 144(t0=%rsi +movq 144(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 152 ] +# asm 1: movq 152(t1=int64#3 +# asm 2: movq 152(t1=%rdx +movq 152(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 128(t0=%rsi +movq 128(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 136 ] +# asm 1: movq 136(t1=int64#3 +# asm 2: movq 136(t1=%rdx +movq 136(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 
+# asm 1: and $1,t0=int64#2 +# asm 2: movq 112(t0=%rsi +movq 112(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 120 ] +# asm 1: movq 120(t1=int64#3 +# asm 2: movq 120(t1=%rdx +movq 120(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 96(t0=%rsi +movq 96(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 104 ] +# asm 1: movq 104(t1=int64#3 +# asm 2: movq 104(t1=%rdx +movq 104(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 80(t0=%rsi +movq 80(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 88 ] +# asm 1: movq 88(t1=int64#3 +# asm 2: movq 88(t1=%rdx +movq 88(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 64(t0=%rsi +movq 64(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 72 ] +# asm 1: movq 72(t1=int64#3 +# asm 2: movq 72(t1=%rdx +movq 72(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 48(t0=%rsi +movq 48(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 56 ] +# asm 1: movq 56(t1=int64#3 +# asm 2: movq 56(t1=%rdx +movq 56(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 32(t0=%rsi +movq 32(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 40 ] +# asm 1: movq 40(t1=int64#3 +# asm 2: movq 40(t1=%rdx +movq 40(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 16(t0=%rsi +movq 16(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 24 ] +# asm 1: movq 24(t1=int64#3 +# asm 2: movq 24(t1=%rdx +movq 24(%rdi),%rdx + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#2 +# asm 2: popcnt c=%rsi +popcnt %rsi, %rsi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,t0=int64#2 +# asm 2: movq 0(t0=%rsi +movq 0(%rdi),%rsi + +# qhasm: t1 = mem64[ input_0 + 8 ] +# asm 1: movq 8(t1=int64#1 +# asm 2: movq 8(t1=%rdi +movq 8(%rdi),%rdi + +# qhasm: t0 ^= t1 +# asm 1: xor c=int64#1 +# asm 2: popcnt c=%rdi +popcnt %rsi, %rdi + +# qhasm: (uint32) c &= 1 +# asm 1: and $1,method_name = OQS_SIG_alg_falcon_1024; - sig->alg_version = "supercop-20201018 via https://github.com/jschanck/package-pqclean/tree/78831f03/falcon"; + sig->alg_version = "supercop-20201018 via https://github.com/jschanck/package-pqclean/tree/cea1fa5a/falcon"; sig->claimed_nist_level = 5; sig->euf_cma = true; diff --git a/src/sig/falcon/sig_falcon_512.c b/src/sig/falcon/sig_falcon_512.c index e6831451ea..d23684abe5 100644 --- a/src/sig/falcon/sig_falcon_512.c +++ b/src/sig/falcon/sig_falcon_512.c @@ -13,7 +13,7 @@ OQS_SIG *OQS_SIG_falcon_512_new() { return NULL; } sig->method_name = OQS_SIG_alg_falcon_512; - sig->alg_version = "supercop-20201018 via https://github.com/jschanck/package-pqclean/tree/78831f03/falcon"; + sig->alg_version = "supercop-20201018 via https://github.com/jschanck/package-pqclean/tree/cea1fa5a/falcon"; sig->claimed_nist_level = 1; sig->euf_cma = true; diff --git a/tests/KATs/sig/kats.json b/tests/KATs/sig/kats.json index ccbc8654cf..de3c7f112d 100644 --- a/tests/KATs/sig/kats.json +++ b/tests/KATs/sig/kats.json @@ -64,4 +64,4 @@ "picnic_L5_FS": 
"f8af8003cca1cca9c7fb56ec4dcb397b6ca13ab585cb88cc89edd33d1572b088", "picnic_L5_UR": "a0366dfc37debef213619d5491d47e232fae27980669be7433ba859638f63015", "picnic_L5_full": "839f72163492e1fd80a2923842cb269e13f60cdc023e0e2bd3533e54d88fe4f8" -} +} \ No newline at end of file From 50167f2570f1e630d07c66dc2915a57cd703e9fb Mon Sep 17 00:00:00 2001 From: Michael <57787676+baentsch@users.noreply.github.com> Date: Mon, 22 Feb 2021 16:51:21 +0100 Subject: [PATCH 3/6] fixing compiler options for McElieceAVX --- .CMake/compiler_opts.cmake | 1 + .circleci/config.yml | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.CMake/compiler_opts.cmake b/.CMake/compiler_opts.cmake index 44d8d44691..8ba83abae0 100644 --- a/.CMake/compiler_opts.cmake +++ b/.CMake/compiler_opts.cmake @@ -25,6 +25,7 @@ if(CMAKE_C_COMPILER_ID MATCHES "Clang") add_compile_options(-fno-optimize-sibling-calls) add_compile_options(-fsanitize-address-use-after-scope) add_compile_options(-fsanitize=address) + add_compile_options(-Wno-language-extension-token) set(SANITIZER_LD_FLAGS "-fsanitize=address") elseif(USE_SANITIZER STREQUAL "Memory") add_compile_options(-fsanitize=address) diff --git a/.circleci/config.yml b/.circleci/config.yml index d7c06d548c..19e78ab73d 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -263,7 +263,7 @@ workflows: name: alpine context: openquantumsafe CONTAINER: openquantumsafe/ci-alpine-amd64:latest - CMAKE_ARGS: -DCMAKE_BUILD_TYPE=Release -DOQS_USE_OPENSSL=ON -DBUILD_SHARED_LIBS=ON + CMAKE_ARGS: -DCMAKE_BUILD_TYPE=Release -DOQS_USE_OPENSSL=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_POSITION_INDEPENDENT_CODE=ON # Disabling centos-8 and debian-buster. # Re-enable if specific configurations (package versions etc) that need to be tested are identified. #- linux_x64: @@ -288,7 +288,7 @@ workflows: name: ubuntu-focal-shared-noopenssl context: openquantumsafe CONTAINER: openquantumsafe/ci-ubuntu-focal-x86_64:latest - CMAKE_ARGS: -DCMAKE_C_COMPILER=gcc-7 -DCMAKE_BUILD_TYPE=Release -DOQS_USE_OPENSSL=OFF -DBUILD_SHARED_LIBS=ON + CMAKE_ARGS: -DCMAKE_C_COMPILER=gcc-7 -DCMAKE_BUILD_TYPE=Release -DOQS_USE_OPENSSL=OFF -DBUILD_SHARED_LIBS=ON -DCMAKE_POSITION_INDEPENDENT_CODE=ON PYTEST_ARGS: --ignore=tests/test_namespace.py --numprocesses=auto - linux_x64: <<: *require_buildcheck From 58e3fe873208e9e59b57eea7ee306df43cfbbc87 Mon Sep 17 00:00:00 2001 From: Michael <57787676+baentsch@users.noreply.github.com> Date: Tue, 23 Feb 2021 07:09:54 +0100 Subject: [PATCH 4/6] getting mceliece avx shared to link --- .circleci/config.yml | 4 ++-- src/CMakeLists.txt | 4 ++++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 19e78ab73d..d7c06d548c 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -263,7 +263,7 @@ workflows: name: alpine context: openquantumsafe CONTAINER: openquantumsafe/ci-alpine-amd64:latest - CMAKE_ARGS: -DCMAKE_BUILD_TYPE=Release -DOQS_USE_OPENSSL=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_POSITION_INDEPENDENT_CODE=ON + CMAKE_ARGS: -DCMAKE_BUILD_TYPE=Release -DOQS_USE_OPENSSL=ON -DBUILD_SHARED_LIBS=ON # Disabling centos-8 and debian-buster. # Re-enable if specific configurations (package versions etc) that need to be tested are identified. 
#- linux_x64: @@ -288,7 +288,7 @@ workflows: name: ubuntu-focal-shared-noopenssl context: openquantumsafe CONTAINER: openquantumsafe/ci-ubuntu-focal-x86_64:latest - CMAKE_ARGS: -DCMAKE_C_COMPILER=gcc-7 -DCMAKE_BUILD_TYPE=Release -DOQS_USE_OPENSSL=OFF -DBUILD_SHARED_LIBS=ON -DCMAKE_POSITION_INDEPENDENT_CODE=ON + CMAKE_ARGS: -DCMAKE_C_COMPILER=gcc-7 -DCMAKE_BUILD_TYPE=Release -DOQS_USE_OPENSSL=OFF -DBUILD_SHARED_LIBS=ON PYTEST_ARGS: --ignore=tests/test_namespace.py --numprocesses=auto - linux_x64: <<: *require_buildcheck diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index a2ffd4a364..301cd225ad 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -77,6 +77,10 @@ if(OQS_USE_OPENSSL) target_link_libraries(oqs PUBLIC ${OPENSSL_CRYPTO_LIBRARY}) target_include_directories(oqs PUBLIC ${OPENSSL_INCLUDE_DIR}) endif() +if(OQS_ENABLE_KEM_CLASSIC_MCELIECE) + # hack to enable AVX symbols to link +target_link_options(oqs PRIVATE -Wl,-Bsymbolic) +endif() set_target_properties(oqs PROPERTIES From 78e37cff7c2cff9fbf47f3bfa160be9e7fa0d259 Mon Sep 17 00:00:00 2001 From: Michael <57787676+baentsch@users.noreply.github.com> Date: Tue, 23 Feb 2021 10:31:06 +0100 Subject: [PATCH 5/6] another mceliece avx try --- .CMake/compiler_opts.cmake | 3 ++- src/CMakeLists.txt | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/.CMake/compiler_opts.cmake b/.CMake/compiler_opts.cmake index 8ba83abae0..4e946b4e6e 100644 --- a/.CMake/compiler_opts.cmake +++ b/.CMake/compiler_opts.cmake @@ -16,6 +16,8 @@ if(CMAKE_C_COMPILER_ID MATCHES "Clang") option(OQS_USE_CPU_EXTENSIONS "Enable compile and run-time support for CPU extensions such as AVX2, SSE, etc." ON) if(OQS_USE_CPU_EXTENSIONS) include(${CMAKE_CURRENT_LIST_DIR}/gcc_clang_intrinsics.cmake) + # to allow McEliece AVX code to compile with clang9: + add_compile_options(-Wno-language-extension-token) endif() if(CMAKE_BUILD_TYPE STREQUAL "Debug" OR CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo") @@ -25,7 +27,6 @@ if(CMAKE_C_COMPILER_ID MATCHES "Clang") add_compile_options(-fno-optimize-sibling-calls) add_compile_options(-fsanitize-address-use-after-scope) add_compile_options(-fsanitize=address) - add_compile_options(-Wno-language-extension-token) set(SANITIZER_LD_FLAGS "-fsanitize=address") elseif(USE_SANITIZER STREQUAL "Memory") add_compile_options(-fsanitize=address) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 301cd225ad..ed96b647b4 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -77,8 +77,8 @@ if(OQS_USE_OPENSSL) target_link_libraries(oqs PUBLIC ${OPENSSL_CRYPTO_LIBRARY}) target_include_directories(oqs PUBLIC ${OPENSSL_INCLUDE_DIR}) endif() -if(OQS_ENABLE_KEM_CLASSIC_MCELIECE) - # hack to enable AVX symbols to link +if (CMAKE_SYSTEM_NAME MATCHES "Linux" AND OQS_USE_AVX2_INSTRUCTIONS AND OQS_ENABLE_KEM_CLASSIC_MCELIECE) + # hack to enable McEliece' AVX symbols to link on Linux target_link_options(oqs PRIVATE -Wl,-Bsymbolic) endif() From a7d2d3d412b28fc65f5c637fecef316ed95458e7 Mon Sep 17 00:00:00 2001 From: Michael <57787676+baentsch@users.noreply.github.com> Date: Tue, 23 Feb 2021 15:18:01 +0100 Subject: [PATCH 6/6] alg doc added [skip ci] --- docs/algorithms/kem/classic_mceliece.md | 6 +++--- docs/algorithms/sig/falcon.md | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/algorithms/kem/classic_mceliece.md b/docs/algorithms/kem/classic_mceliece.md index fdd75738f8..b3bd378ca3 100644 --- a/docs/algorithms/kem/classic_mceliece.md +++ b/docs/algorithms/kem/classic_mceliece.md @@ -10,11 +10,11 @@ 
Classic McEliece
 Implementation
 --------------
 
-- **Source of implementation**: SUPERCOP-20191221, "vec" implementation
-- **Implementation version**: https://github.com/PQClean/PQClean/commit/3c8be3cb1f8ca0953f2df44ea665f118efb468d6
+- **Source of implementation**: SUPERCOP-20191221, "vec" implementation ("avx" implementation if OQS_USE_CPU_EXTENSIONS is set)
+- **Implementation version**: https://github.com/PQClean/PQClean/commit/ebcc71c51a30b6e5db4f1fade22999b346fdafce
 - **License**: Public domain
 - **Constant-time**: Yes
-- **Optimizations**: Portable C
+- **Optimizations**: Portable C, with AVX2, POPCNT, BMI1 instructions (if available at run-time)
 
 Parameter sets
 --------------
diff --git a/docs/algorithms/sig/falcon.md b/docs/algorithms/sig/falcon.md
index 269b56ac82..1b2dec04c2 100644
--- a/docs/algorithms/sig/falcon.md
+++ b/docs/algorithms/sig/falcon.md
@@ -10,8 +10,8 @@ Falcon
 Implementation
 --------------
 
-- **Source of implementation**: supercop-20201018 via https://github.com/jschanck/package-pqclean/tree/78831f03/falcon
-- **Implementation version**: https://github.com/PQClean/PQClean/commit/3c8be3cb1f8ca0953f2df44ea665f118efb468d6
+- **Source of implementation**: supercop-20201018 via https://github.com/jschanck/package-pqclean/tree/cea1fa5a/falcon
+- **Implementation version**: https://github.com/PQClean/PQClean/commit/ebcc71c51a30b6e5db4f1fade22999b346fdafce
 - **License**: CC0 1.0 Universal
 - **Constant-time**: Yes
 - **Optimizations**: Portable C, with AVX2 instructions (if available at runtime)
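
Reviewer's note (not part of the patches above): the sketch below is a minimal caller of the existing liboqs public KEM API from <oqs/oqs.h>, included only to illustrate that the choice between the portable "vec" code and the new "avx" Classic McEliece code added in this series is made inside the library — as the updated classic_mceliece.md says, the AVX2/POPCNT/BMI1 path is taken only when OQS_USE_CPU_EXTENSIONS is ON (the CMake default shown in patch 5) and the CPU supports it at run time. It assumes a build from this branch; error handling is abbreviated.

/* Illustrative only -- not part of the patch series. Uses the existing
 * liboqs KEM API; whether the "vec" or the "avx" Classic McEliece
 * implementation runs is decided inside the library, not by the caller. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <oqs/oqs.h>

int main(void) {
    OQS_KEM *kem = OQS_KEM_new(OQS_KEM_alg_classic_mceliece_348864);
    if (kem == NULL) {
        fprintf(stderr, "Classic-McEliece-348864 is not enabled in this build\n");
        return EXIT_FAILURE;
    }
    uint8_t *pk = malloc(kem->length_public_key);
    uint8_t *sk = malloc(kem->length_secret_key);
    uint8_t *ct = malloc(kem->length_ciphertext);
    uint8_t *ss_enc = malloc(kem->length_shared_secret);
    uint8_t *ss_dec = malloc(kem->length_shared_secret);
    if (pk == NULL || sk == NULL || ct == NULL || ss_enc == NULL || ss_dec == NULL) {
        fprintf(stderr, "allocation failed\n");
        return EXIT_FAILURE;
    }
    /* keypair -> encaps -> decaps round trip */
    if (OQS_KEM_keypair(kem, pk, sk) != OQS_SUCCESS ||
            OQS_KEM_encaps(kem, ct, ss_enc, pk) != OQS_SUCCESS ||
            OQS_KEM_decaps(kem, ss_dec, ct, sk) != OQS_SUCCESS) {
        fprintf(stderr, "KEM operation failed\n");
        return EXIT_FAILURE;
    }
    printf("shared secrets match: %s\n",
           memcmp(ss_enc, ss_dec, kem->length_shared_secret) == 0 ? "yes" : "no");
    /* wipe secret material before freeing */
    OQS_MEM_cleanse(sk, kem->length_secret_key);
    OQS_MEM_cleanse(ss_enc, kem->length_shared_secret);
    OQS_MEM_cleanse(ss_dec, kem->length_shared_secret);
    free(pk); free(sk); free(ct); free(ss_enc); free(ss_dec);
    OQS_KEM_free(kem);
    return EXIT_SUCCESS;
}

Linking such a test program against a shared liboqs on Linux (roughly: cc kem_roundtrip.c -loqs, file name hypothetical) is also a quick way to confirm that the -Wl,-Bsymbolic workaround from patch 5 lets the McEliece AVX symbols resolve; the Falcon code added in patch 2 can be exercised the same way through the analogous OQS_SIG_* calls.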