@@ -2248,41 +2248,80 @@ static void float_to_float16_v_slow_path(C2_MacroAssembler& masm,
22482248#define __ masm.
22492249 VectorRegister dst = stub.data <0 >();
22502250 VectorRegister src = stub.data <1 >();
2251- VectorRegister tmp = stub.data <2 >();
2251+ VectorRegister vtmp = stub.data <2 >();
2252+ assert_different_registers (dst, src, vtmp);
2253+
22522254 __ bind (stub.entry ());
22532255
2256+ // Active elements (NaNs) are marked in v0 mask register.
22542257 // mul is already set to mf2 in float_to_float16_v.
22552258
2256- // preserve the payloads of non-canonical NaNs.
2257- __ vnsra_wi (dst, src, 13 , Assembler::v0_t );
2258-
2259- // preserve the sign bit.
2260- __ vnsra_wi (tmp, src, 26 , Assembler::v0_t );
2261- __ vsll_vi (tmp, tmp, 10 , Assembler::v0_t );
2262- __ mv (t0, 0x3ff );
2263- __ vor_vx (tmp, tmp, t0, Assembler::v0_t );
2264-
2265- // get the result by merging sign bit and payloads of preserved non-canonical NaNs.
2266- __ vand_vv (dst, dst, tmp, Assembler::v0_t );
2259+ // Float (32 bits)
2260+ // Bit: 31 30 to 23 22 to 0
2261+ // +---+------------------+-----------------------------+
2262+ // | S | Exponent | Mantissa (Fraction) |
2263+ // +---+------------------+-----------------------------+
2264+ // 1 bit 8 bits 23 bits
2265+ //
2266+ // Float (16 bits)
2267+ // Bit: 15 14 to 10 9 to 0
2268+ // +---+----------------+------------------+
2269+ // | S | Exponent | Mantissa |
2270+ // +---+----------------+------------------+
2271+ // 1 bit 5 bits 10 bits
2272+ const int fp_sign_bits = 1 ;
2273+ const int fp32_bits = 32 ;
2274+ const int fp32_mantissa_2nd_part_bits = 9 ;
2275+ const int fp32_mantissa_3rd_part_bits = 4 ;
2276+ const int fp16_exponent_bits = 5 ;
2277+ const int fp16_mantissa_bits = 10 ;
2278+
2279+ // preserve the sign bit and exponent, clear mantissa.
2280+ __ vnsra_wi (dst, src, fp32_bits - fp_sign_bits - fp16_exponent_bits, Assembler::v0_t );
2281+ __ vsll_vi (dst, dst, fp16_mantissa_bits, Assembler::v0_t );
2282+
2283+ // Preserve high order bit of float NaN in the
2284+ // binary16 result NaN (tenth bit); OR in remaining
2285+ // bits into lower 9 bits of binary 16 significand.
2286+ // | (doppel & 0x007f_e000) >> 13 // 10 bits
2287+ // | (doppel & 0x0000_1ff0) >> 4 // 9 bits
2288+ // | (doppel & 0x0000_000f)); // 4 bits
2289+ //
2290+ // Check j.l.Float.floatToFloat16 for more information.
2291+ // 10 bits
2292+ __ vnsrl_wi (vtmp, src, fp32_mantissa_2nd_part_bits + fp32_mantissa_3rd_part_bits, Assembler::v0_t );
2293+ __ mv (t0, 0x3ff ); // retain first part of mantissa in a float 32
2294+ __ vand_vx (vtmp, vtmp, t0, Assembler::v0_t );
2295+ __ vor_vv (dst, dst, vtmp, Assembler::v0_t );
2296+ // 9 bits
2297+ __ vnsrl_wi (vtmp, src, fp32_mantissa_3rd_part_bits, Assembler::v0_t );
2298+ __ mv (t0, 0x1ff ); // retain second part of mantissa in a float 32
2299+ __ vand_vx (vtmp, vtmp, t0, Assembler::v0_t );
2300+ __ vor_vv (dst, dst, vtmp, Assembler::v0_t );
2301+ // 4 bits
2302+ // Narrow shift is necessary to move data from 32 bits element to 16 bits element in vector register.
2303+ __ vnsrl_wi (vtmp, src, 0 , Assembler::v0_t );
2304+ __ vand_vi (vtmp, vtmp, 0xf , Assembler::v0_t );
2305+ __ vor_vv (dst, dst, vtmp, Assembler::v0_t );
22672306
22682307 __ j (stub.continuation ());
22692308#undef __
22702309}
22712310
22722311// j.l.Float.floatToFloat16
2273- void C2_MacroAssembler::float_to_float16_v (VectorRegister dst, VectorRegister src, VectorRegister vtmp,
2274- Register tmp, uint vector_length) {
2312+ void C2_MacroAssembler::float_to_float16_v (VectorRegister dst, VectorRegister src,
2313+ VectorRegister vtmp, Register tmp, uint vector_length) {
22752314 assert_different_registers (dst, src, vtmp);
22762315
22772316 auto stub = C2CodeStub::make<VectorRegister, VectorRegister, VectorRegister>
2278- (dst, src, vtmp, 28 , float_to_float16_v_slow_path);
2317+ (dst, src, vtmp, 56 , float_to_float16_v_slow_path);
22792318
22802319 // On riscv, NaN needs a special process as vfncvt_f_f_w does not work in that case.
22812320
22822321 vsetvli_helper (BasicType::T_FLOAT, vector_length, Assembler::m1);
22832322
22842323 // check whether there is a NaN.
2285- // replace v_fclass with vmseq_vv as performance optimization.
2324+ // replace v_fclass with vmfne_vv as performance optimization.
22862325 vmfne_vv (v0, src, src);
22872326 vcpop_m (t0, v0);
22882327
0 commit comments