@@ -2203,114 +2203,117 @@ void C2_MacroAssembler::sve_gen_mask_imm(PRegister dst, BasicType bt, uint32_t l
22032203//  Pack active elements of src, under the control of mask, into the lowest-numbered elements of dst.
22042204//  Any remaining elements of dst will be filled with zero.
22052205//  Clobbers: rscratch1
2206- //  Preserves: src, mask 
2206+ //  Preserves: mask, vzr 
//  Elements are of type SHORT; each half of src is widened to INT, compacted, and narrowed back.
//  Parameters (new signature):
//    vzr    - second source of the narrowing sve_uzp1 calls. sve_compress_byte fills it with
//             zero (sve_dup) before calling; other callers presumably must do the same - TODO confirm.
//    vtmp   - scratch vector that receives the compressed highest half; overwritten.
//    pgtmp  - scratch governing predicate; overwritten.
//    vector_length_in_bytes - when it is <= MaxVectorSize/2 only the lowest half of src is
//             processed and rscratch1/vtmp are left untouched.
22072207void  C2_MacroAssembler::sve_compress_short (FloatRegister dst, FloatRegister src, PRegister mask,
2208-                                            FloatRegister vtmp1 , FloatRegister vtmp2 ,
2209-                                            PRegister pgtmp) {
2208+                                            FloatRegister vzr , FloatRegister vtmp ,
2209+                                            PRegister pgtmp,  unsigned  vector_length_in_bytes ) {
22102210  assert (pgtmp->is_governing (), " This register has to be a governing predicate register" 
2211-   assert_different_registers (dst, src, vtmp1, vtmp2);
2211+   //  When called by sve_compress_byte, src and vtmp may be the same register.
  //  That aliasing is safe because the last read of src is the sve_uunpkhi that writes vtmp.
  //  dst must differ from src: dst is written (lowest half) before the highest half of src is read.
2212+   assert_different_registers (dst, src, vzr);
2213+   assert_different_registers (dst, vtmp, vzr);
22122214  assert_different_registers (mask, pgtmp);
2213- 
2214-   //  Example input:   src   = 8888 7777 6666 5555 4444 3333 2222 1111
2215-   //                   mask  = 0001 0000 0000 0001 0001 0000 0001 0001
2216-   //  Expected result: dst   = 0000 0000 0000 8888 5555 4444 2222 1111
2217-   sve_dup (vtmp2, H, 0 );
2215+   //  high <-- low
2216+   //  Example input:   src   = hh gg ff ee dd cc bb aa, one character is 8 bits.
2217+   //                   mask  = 01 00 00 01 01 00 01 01, one character is 1 bit.
2218+   //  Expected result: dst   = 00 00 00 hh ee dd bb aa
22182219
22192220  //  Extend lowest half to type INT.
2220-   //  dst = 00004444 00003333 00002222 00001111 
2221+   //  dst   =  00dd  00cc  00bb  00aa 
22212222  sve_uunpklo (dst, S, src);
2222-   //  pgtmp = 00000001 00000000 00000001 00000001 
2223+   //  pgtmp =  0001  0000  0001  0001 
22232224  sve_punpklo (pgtmp, mask);
22242225  //  Pack the active elements in size of type INT to the right,
22252226  //  and fill the remaining elements with zero.
2226-   //  dst = 00000000 00004444 00002222 00001111 
2227+   //  dst   =  0000  00dd  00bb  00aa 
22272228  sve_compact (dst, S, dst, pgtmp);
22282229  //  Narrow the result back to type SHORT.
2229-   //  dst = 0000 0000 0000 0000 0000 4444 2222 1111
2230-   sve_uzp1 (dst, H, dst, vtmp2);
2230+   //  dst   = 00 00 00 00 00 dd bb aa
2231+   sve_uzp1 (dst, H, dst, vzr);
2232+ 
2233+   //  Return early if the vector length is no more than MaxVectorSize/2, since the
2234+   //  highest half is invalid in that case and dst already holds the complete result.
2235+   if  (vector_length_in_bytes <= (MaxVectorSize >> 1 )) {
2236+     return ;
2237+   }
2238+ 
22312239  //  Count the active elements of lowest half.
22322240  //  rscratch1 = 3
  //  The count must be taken here: pgtmp is overwritten by the sve_punpkhi below.
22332241  sve_cntp (rscratch1, S, ptrue, pgtmp);
22342242
22352243  //  Repeat to the highest half.
2236-   //  pgtmp = 00000001 00000000 00000000 00000001 
2244+   //  pgtmp =  0001  0000  0000  0001 
22372245  sve_punpkhi (pgtmp, mask);
2238-   //  vtmp1 = 00008888 00007777 00006666 00005555
2239-   sve_uunpkhi (vtmp1, S, src);
2240-   //  vtmp1 = 00000000 00000000 00008888 00005555
2241-   sve_compact (vtmp1, S, vtmp1, pgtmp);
2242-   //  vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
2243-   sve_uzp1 (vtmp1, H, vtmp1, vtmp2);
2244- 
2245-   //  Compressed low:   dst   = 0000 0000 0000 0000 0000 4444 2222 1111
2246-   //  Compressed high:  vtmp1 = 0000 0000 0000 0000 0000 0000 8888  5555
2247-   //  Left shift(cross lane) compressed high with TRUE_CNT lanes,
2248-   //  TRUE_CNT is the number of active elements in the compressed low.
2249-   neg (rscratch1, rscratch1);
2250-   //  vtmp2 = {4 3 2 1 0 -1 -2 -3}
2251-   sve_index (vtmp2, H, rscratch1, 1 );
2252-   //  vtmp1 = 0000 0000 0000 8888 5555 0000 0000 0000
2253-   sve_tbl (vtmp1, H, vtmp1, vtmp2);
2254- 
2255-   //  Combine the compressed high(after shifted) with the compressed low.
2256-   //  dst = 0000 0000 0000 8888 5555 4444 2222 1111
2257-   sve_orr (dst, dst, vtmp1);
2246+   //  vtmp  =  00hh  00gg  00ff  00ee
2247+   sve_uunpkhi (vtmp, S, src);
2248+   //  vtmp  =  0000  0000  00hh  00ee
2249+   sve_compact (vtmp, S, vtmp, pgtmp);
2250+   //  vtmp  = 00 00 00 00 00 00 hh ee
2251+   sve_uzp1 (vtmp, H, vtmp, vzr);
2252+ 
  //  Build a predicate whose lowest rscratch1 SHORT lanes are active, i.e. one lane
  //  per element already packed into dst.
2253+   //  pgtmp = 00 00 00 00 00 01 01 01
2254+   sve_whilelt (pgtmp, H, zr, rscratch1);
2255+   //  Compressed low:  dst  = 00 00 00 00 00 dd bb aa
2256+   //  Compressed high: vtmp = 00 00 00 00 00 00 hh ee
2257+   //  Combine the compressed low with the compressed high:
2258+   //                   dst  = 00 00 00 hh ee dd bb aa
  //  The splice keeps the pgtmp-active lanes of dst and places the lowest lanes of vtmp
  //  directly above them, replacing the removed sve_index/sve_tbl/sve_orr sequence.
2259+   sve_splice (dst, H, pgtmp, vtmp);
22582260}
22592261
//  BYTE variant of the compress operation: packs the active BYTE elements of src, selected by
//  mask, into the lowest-numbered elements of dst (see the example in the body). Each half of
//  src is widened to SHORT and compressed with sve_compress_short, then narrowed back to BYTE.
//  vtmp1/vtmp2/vtmp3 and ptmp/pgtmp are scratch registers and are overwritten; vtmp3 is
//  filled with zero and used as the zero source (vzr).
//  vector_length_in_bytes: when it is <= MaxVectorSize/2 only the lowest half of src is processed.
22602262//  Clobbers: rscratch1, rscratch2
22612263//  Preserves: src, mask
22622264void  C2_MacroAssembler::sve_compress_byte (FloatRegister dst, FloatRegister src, PRegister mask,
2263-                                           FloatRegister vtmp1, FloatRegister vtmp2,
2264-                                           FloatRegister vtmp3, FloatRegister vtmp4,
2265-                                           PRegister ptmp, PRegister pgtmp) {
2265+                                           FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
2266+                                           PRegister ptmp, PRegister pgtmp, unsigned  vector_length_in_bytes) {
22662267  assert (pgtmp->is_governing (), " This register has to be a governing predicate register" 
2267-   assert_different_registers (dst, src, vtmp1, vtmp2, vtmp3, vtmp4 );
2268+   assert_different_registers (dst, src, vtmp1, vtmp2, vtmp3);
22682269  assert_different_registers (mask, ptmp, pgtmp);
2269-   //  Example input:   src   = 88 77 66 55 44 33 22 11
2270-   //                   mask  = 01 00 00 01 01 00 01 01
2271-   //  Expected result: dst   = 00 00 00 88 55 44 22 11
2270+   //  high <-- low
2271+   //  Example input:   src   = q p n m l k j i h g f e d c b a, one character is 8 bits.
2272+   //                   mask  = 0 1 0 0 0 0 0 1 0 1 0 0 0 1 0 1, one character is 1 bit.
2273+   //  Expected result: dst   = 0 0 0 0 0 0 0 0 0 0 0 p i g c a
  //  vtmp3 is dedicated to holding zero for the whole function; it is zeroed once here and
  //  relied upon to stay zero (sve_compress_short is documented to preserve vzr).
2274+   FloatRegister vzr = vtmp3;
2275+   sve_dup (vzr, B, 0 );
22722276
2273-   sve_dup (vtmp4, B, 0 );
22742277  //  Extend lowest half to type SHORT.
2275-   //  vtmp1 = 0044 0033 0022 0011 
2278+   //  vtmp1 =  0h  0g  0f  0e  0d  0c  0b  0a 
22762279  sve_uunpklo (vtmp1, H, src);
2277-   //  ptmp = 0001 0000 0001 0001 
2280+   //  ptmp  =  00  01  00  00  00  01  00  01 
22782281  sve_punpklo (ptmp, mask);
2279-   //  Count the active elements of lowest half.
2280-   //  rscratch2 = 3
2281-   sve_cntp (rscratch2, H, ptrue, ptmp);
22822282  //  Pack the active elements in size of type SHORT to the right,
22832283  //  and fill the remaining elements with zero.
2284-   //  dst = 0000 0044 0022 0011
2285-   sve_compress_short (dst, vtmp1, ptmp, vtmp2, vtmp3, pgtmp);
2284+   //  dst   =  00  00  00  00  00  0g  0c  0a
  //  After widening to SHORT the lowest half occupies twice as many bytes; the length
  //  passed down is that doubled size, capped at MaxVectorSize.
2285+   unsigned  extended_size = vector_length_in_bytes << 1 ;
2286+   sve_compress_short (dst, vtmp1, ptmp, vzr, vtmp2, pgtmp, extended_size > MaxVectorSize ? MaxVectorSize : extended_size);
22862287  //  Narrow the result back to type BYTE.
2287-   //  dst = 00 00 00 00 00 44 22 11
2288-   sve_uzp1 (dst, B, dst, vtmp4);
2288+   //  dst   = 0 0 0 0 0 0 0 0 0 0 0 0 0 g c a
2289+   sve_uzp1 (dst, B, dst, vzr);
2290+ 
2291+   //  Return early if the vector length is no more than MaxVectorSize/2, since the
2292+   //  highest half is invalid in that case and dst already holds the complete result.
2293+   if  (vector_length_in_bytes <= (MaxVectorSize >> 1 )) {
2294+     return ;
2295+   }
2296+   //  Count the active elements of lowest half.
2297+   //  rscratch2 = 3
  //  ptmp still holds the lowest-half mask here because sve_compress_short preserves its
  //  mask argument; the count is only needed for the final splice predicate.
2298+   sve_cntp (rscratch2, H, ptrue, ptmp);
22892299
22902300  //  Repeat to the highest half.
2291-   //  ptmp = 0001 0000 0000 0001 
2301+   //  ptmp  =  00  01  00  00  00  00  00  01 
22922302  sve_punpkhi (ptmp, mask);
2293-   //  vtmp1  = 0088 0077 0066 0055 
2303+   //  vtmp2  =  0q  0p  0n  0m  0l  0k  0j  0i 
22942304  sve_uunpkhi (vtmp2, H, src);
2295-   //  vtmp1 = 0000 0000 0088 0055
2296-   sve_compress_short (vtmp1, vtmp2, ptmp, vtmp3, vtmp4, pgtmp);
2297- 
2298-   sve_dup (vtmp4, B, 0 );
2299-   //  vtmp1 = 00 00 00 00 00 00 88 55
2300-   sve_uzp1 (vtmp1, B, vtmp1, vtmp4);
2301- 
2302-   //  Compressed low:   dst   = 00 00 00 00 00 44 22 11
2303-   //  Compressed high:  vtmp1 = 00 00 00 00 00 00 88 55
2304-   //  Left shift(cross lane) compressed high with TRUE_CNT lanes,
2305-   //  TRUE_CNT is the number of active elements in the compressed low.
2306-   neg (rscratch2, rscratch2);
2307-   //  vtmp2 = {4 3 2 1 0 -1 -2 -3}
2308-   sve_index (vtmp2, B, rscratch2, 1 );
2309-   //  vtmp1 = 00 00 00 88 55 00 00 00
2310-   sve_tbl (vtmp1, B, vtmp1, vtmp2);
2311-   //  Combine the compressed high(after shifted) with the compressed low.
2312-   //  dst = 00 00 00 88 55 44 22 11
2313-   sve_orr (dst, dst, vtmp1);
2305+   //  vtmp1 =  00  00  00  00  00  00  0p  0i
  //  vtmp2 is passed as both src and vtmp (sve_compress_short allows this aliasing).
  //  The early return above guarantees extended_size > MaxVectorSize, so the length
  //  passed here (the part of the widened data beyond one full vector) is positive.
2306+   sve_compress_short (vtmp1, vtmp2, ptmp, vzr, vtmp2, pgtmp, extended_size - MaxVectorSize);
2307+   //  vtmp1 = 0 0 0 0 0 0 0 0 0 0 0 0 0 0 p i
2308+   sve_uzp1 (vtmp1, B, vtmp1, vzr);
2309+ 
  //  Build a predicate whose lowest rscratch2 BYTE lanes are active, i.e. one lane per
  //  element already packed into dst.
2310+   //  ptmp  = 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1
2311+   sve_whilelt (ptmp, B, zr, rscratch2);
2312+   //  Compressed low:  dst   = 0 0 0 0 0 0 0 0 0 0 0 0 0 g c a
2313+   //  Compressed high: vtmp1 = 0 0 0 0 0 0 0 0 0 0 0 0 0 0 p i
2314+   //  Combine the compressed low with the compressed high:
2315+   //                   dst   = 0 0 0 0 0 0 0 0 0 0 0 p i g c a
2316+   sve_splice (dst, B, ptmp, vtmp1);
23142317}
23152318
23162319void  C2_MacroAssembler::neon_reverse_bits (FloatRegister dst, FloatRegister src, BasicType bt, bool  isQ) {
0 commit comments