 //
-// Copyright (c) 2011, 2020, Oracle and/or its affiliates. All rights reserved.
+// Copyright (c) 2011, 2021, Oracle and/or its affiliates. All rights reserved.
 // DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 //
 // This code is free software; you can redistribute it and/or modify it
@@ -1354,6 +1354,7 @@ Assembler::Width widthForType(BasicType bt) {
   static address vector_long_sign_mask() { return StubRoutines::x86::vector_long_sign_mask(); }
   static address vector_all_bits_set() { return StubRoutines::x86::vector_all_bits_set(); }
   static address vector_int_to_short_mask() { return StubRoutines::x86::vector_int_to_short_mask(); }
+  static address vector_byte_shufflemask() { return StubRoutines::x86::vector_byte_shuffle_mask(); }
   static address vector_short_shufflemask() { return StubRoutines::x86::vector_short_shuffle_mask(); }
   static address vector_int_shufflemask() { return StubRoutines::x86::vector_int_shuffle_mask(); }
   static address vector_long_shufflemask() { return StubRoutines::x86::vector_long_shuffle_mask(); }
@@ -1691,9 +1692,9 @@ const bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType
         return false; // Implementation limitation due to how shuffle is loaded
       } else if (size_in_bits == 256 && UseAVX < 2) {
         return false; // Implementation limitation
-      } else if (bt == T_BYTE && size_in_bits >= 256 && !VM_Version::supports_avx512_vbmi()) {
+      } else if (bt == T_BYTE && size_in_bits > 256 && !VM_Version::supports_avx512_vbmi()) {
         return false; // Implementation limitation
-      } else if (bt == T_SHORT && size_in_bits >= 256 && !VM_Version::supports_avx512bw()) {
+      } else if (bt == T_SHORT && size_in_bits > 256 && !VM_Version::supports_avx512bw()) {
         return false; // Implementation limitation
       }
       break;
@@ -7500,13 +7501,24 @@ instruct rearrangeB(vec dst, vec shuffle) %{
   ins_pipe( pipe_slow );
 %}
 
-instruct rearrangeB_avx(vec dst, vec src, vec shuffle) %{
+instruct rearrangeB_avx(legVec dst, legVec src, vec shuffle, legVec vtmp1, legVec vtmp2, rRegP scratch) %{
   predicate(vector_element_basic_type(n) == T_BYTE &&
             vector_length(n) == 32 && !VM_Version::supports_avx512_vbmi());
   match(Set dst (VectorRearrange src shuffle));
-  format %{ "vector_rearrange $dst, $shuffle, $src" %}
+  effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP scratch);
+  format %{ "vector_rearrange $dst, $shuffle, $src\t! using $vtmp1, $vtmp2, $scratch as TEMP" %}
   ins_encode %{
-    __ vpshufb($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, Assembler::AVX_256bit);
+    assert(UseAVX >= 2, "required");
+    // Swap src into vtmp1
+    __ vperm2i128($vtmp1$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 1);
+    // Shuffle swapped src to get entries from other 128 bit lane
+    __ vpshufb($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
+    // Shuffle original src to get entries from self 128 bit lane
+    __ vpshufb($dst$$XMMRegister, $src$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
+    // Create a blend mask by setting high bits for entries coming from other lane in shuffle
+    __ vpaddb($vtmp2$$XMMRegister, $shuffle$$XMMRegister, ExternalAddress(vector_byte_shufflemask()), Assembler::AVX_256bit, $scratch$$Register);
+    // Perform the blend
+    __ vpblendvb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, Assembler::AVX_256bit);
   %}
   ins_pipe( pipe_slow );
 %}
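For readers unfamiliar with the trick, here is a minimal, hypothetical C++/AVX2 intrinsics sketch of the lane-crossing rearrange that rearrangeB_avx (and rearrangeS_avx below) encodes. The function name is invented, and the 0x70/0xF0 bias table is an assumption about what the new vector_byte_shuffle_mask stub holds; the only property relied on is that adding the bias sets bit 7 of an index exactly when the byte lives in the other 128-bit lane.

```c++
#include <immintrin.h>
#include <stdint.h>

// Hypothetical illustration only -- not part of the patch.
static __m256i rearrange_bytes_avx2(__m256i src, __m256i shuffle) {
  // Assumed contents of vector_byte_shuffle_mask(): 0x70 in the low lane, 0xF0 in the high lane.
  alignas(32) static const uint8_t bias[32] = {
    0x70,0x70,0x70,0x70,0x70,0x70,0x70,0x70,0x70,0x70,0x70,0x70,0x70,0x70,0x70,0x70,
    0xF0,0xF0,0xF0,0xF0,0xF0,0xF0,0xF0,0xF0,0xF0,0xF0,0xF0,0xF0,0xF0,0xF0,0xF0,0xF0 };
  __m256i swapped    = _mm256_permute2x128_si256(src, src, 0x01); // vperm2i128: swap the 128-bit lanes
  __m256i from_other = _mm256_shuffle_epi8(swapped, shuffle);     // vpshufb picks (idx & 15) from the other lane
  __m256i from_self  = _mm256_shuffle_epi8(src, shuffle);         // vpshufb picks (idx & 15) from the same lane
  __m256i blend_mask = _mm256_add_epi8(shuffle,                   // vpaddb: bit 7 set => cross-lane index
                                       _mm256_load_si256((const __m256i*)bias));
  return _mm256_blendv_epi8(from_self, from_other, blend_mask);   // vpblendvb: choose per byte
}
```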
@@ -7527,26 +7539,40 @@ instruct rearrangeB_evex(vec dst, vec src, vec shuffle) %{
 
 instruct loadShuffleS(vec dst, vec src, vec vtmp, rRegP scratch) %{
   predicate(vector_element_basic_type(n) == T_SHORT &&
-            vector_length(n) <= 8 && !VM_Version::supports_avx512bw()); // NB! aligned with rearrangeS
+            vector_length(n) <= 16 && !VM_Version::supports_avx512bw()); // NB! aligned with rearrangeS
   match(Set dst (VectorLoadShuffle src));
   effect(TEMP dst, TEMP vtmp, TEMP scratch);
   format %{ "vector_load_shuffle $dst, $src\t! using $vtmp and $scratch as TEMP" %}
   ins_encode %{
     // Create a byte shuffle mask from short shuffle mask
     // only byte shuffle instruction available on these platforms
+    int vlen_in_bytes = vector_length_in_bytes(this);
+    if (vlen_in_bytes <= 8) {
+      // Multiply each shuffle by two to get byte index
+      __ pmovzxbw($vtmp$$XMMRegister, $src$$XMMRegister);
+      __ psllw($vtmp$$XMMRegister, 1);
+
+      // Duplicate to create 2 copies of byte index
+      __ movdqu($dst$$XMMRegister, $vtmp$$XMMRegister);
+      __ psllw($dst$$XMMRegister, 8);
+      __ por($dst$$XMMRegister, $vtmp$$XMMRegister);
+
+      // Add one to get alternate byte index
+      __ movdqu($vtmp$$XMMRegister, ExternalAddress(vector_short_shufflemask()), $scratch$$Register);
+      __ paddb($dst$$XMMRegister, $vtmp$$XMMRegister);
+    } else {
+      int vlen_enc = vector_length_encoding(this);
+      // Multiply each shuffle by two to get byte index
+      __ vpmovzxbw($vtmp$$XMMRegister, $src$$XMMRegister, vlen_enc);
+      __ vpsllw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 1, vlen_enc);
 
-    // Multiply each shuffle by two to get byte index
-    __ pmovzxbw($vtmp$$XMMRegister, $src$$XMMRegister);
-    __ psllw($vtmp$$XMMRegister, 1);
-
-    // Duplicate to create 2 copies of byte index
-    __ movdqu($dst$$XMMRegister, $vtmp$$XMMRegister);
-    __ psllw($dst$$XMMRegister, 8);
-    __ por($dst$$XMMRegister, $vtmp$$XMMRegister);
+      // Duplicate to create 2 copies of byte index
+      __ vpsllw($dst$$XMMRegister, $vtmp$$XMMRegister, 8, vlen_enc);
+      __ vpor($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 
-    // Add one to get alternate byte index
-    __ movdqu($vtmp$$XMMRegister, ExternalAddress(vector_short_shufflemask()), $scratch$$Register);
-    __ paddb($dst$$XMMRegister, $vtmp$$XMMRegister);
+      // Add one to get alternate byte index
+      __ vpaddb($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_short_shufflemask()), vlen_enc, $scratch$$Register);
+    }
   %}
   ins_pipe( pipe_slow );
 %}
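As a cross-check, here is a short, hypothetical C++/AVX2 sketch of what the new 256-bit branch of loadShuffleS computes: each short-element index i becomes the byte pair (2*i, 2*i + 1), so the later rearrange can run as a plain byte shuffle. The 0x0100 constant plays the role the patch assigns to vector_short_shufflemask(); the stub's exact contents are assumed here.

```c++
#include <immintrin.h>

// Hypothetical illustration only -- not part of the patch.
// Expands 16 byte-sized short-shuffle indices into a 32-byte vpshufb control.
static __m256i expand_short_shuffle_avx2(__m128i indices) {
  __m256i w    = _mm256_cvtepu8_epi16(indices);                  // vpmovzxbw: zero-extend indices to 16 bits
  __m256i twoi = _mm256_slli_epi16(w, 1);                        // vpsllw by 1: 2*i in each 16-bit element
  __m256i dup  = _mm256_or_si256(_mm256_slli_epi16(twoi, 8),     // duplicate: bytes [2*i, 2*i]
                                 twoi);
  return _mm256_add_epi8(dup, _mm256_set1_epi16(0x0100));        // add bytes {0,1}: gives [2*i, 2*i + 1]
}
```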
@@ -7563,6 +7589,28 @@ instruct rearrangeS(vec dst, vec shuffle) %{
   ins_pipe( pipe_slow );
 %}
 
+instruct rearrangeS_avx(legVec dst, legVec src, vec shuffle, legVec vtmp1, legVec vtmp2, rRegP scratch) %{
+  predicate(vector_element_basic_type(n) == T_SHORT &&
+            vector_length(n) == 16 && !VM_Version::supports_avx512bw());
+  match(Set dst (VectorRearrange src shuffle));
+  effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP scratch);
+  format %{ "vector_rearrange $dst, $shuffle, $src\t! using $vtmp1, $vtmp2, $scratch as TEMP" %}
+  ins_encode %{
+    assert(UseAVX >= 2, "required");
+    // Swap src into vtmp1
+    __ vperm2i128($vtmp1$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 1);
+    // Shuffle swapped src to get entries from other 128 bit lane
+    __ vpshufb($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
+    // Shuffle original src to get entries from self 128 bit lane
+    __ vpshufb($dst$$XMMRegister, $src$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
+    // Create a blend mask by setting high bits for entries coming from other lane in shuffle
+    __ vpaddb($vtmp2$$XMMRegister, $shuffle$$XMMRegister, ExternalAddress(vector_byte_shufflemask()), Assembler::AVX_256bit, $scratch$$Register);
+    // Perform the blend
+    __ vpblendvb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, Assembler::AVX_256bit);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 instruct loadShuffleS_evex(vec dst, vec src) %{
   predicate(vector_element_basic_type(n) == T_SHORT &&
             VM_Version::supports_avx512bw());