Skip to content

Commit 8649731

Browse files
committed
Implement all vendor intrinsics used by the simd-json crate
1 parent ecf79a3 commit 8649731

File tree

1 file changed

+138
-0
lines changed

1 file changed

+138
-0
lines changed

src/intrinsics/llvm_x86.rs

+138
Original file line numberDiff line numberDiff line change
@@ -590,6 +590,44 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
590590
}
591591
}
592592

593+
"llvm.x86.sse41.packusdw" => {
594+
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi32&ig_expand=4912
595+
intrinsic_args!(fx, args => (a, b); intrinsic);
596+
597+
assert_eq!(a.layout(), b.layout());
598+
let layout = a.layout();
599+
600+
let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
601+
let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
602+
assert_eq!(lane_ty, fx.tcx.types.i32);
603+
assert_eq!(ret_lane_ty, fx.tcx.types.u16);
604+
assert_eq!(lane_count * 2, ret_lane_count);
605+
606+
let min_u16 = fx.bcx.ins().iconst(types::I32, i64::from(u16::MIN));
607+
let max_u16 = fx.bcx.ins().iconst(types::I32, i64::from(u16::MAX));
608+
let ret_lane_layout = fx.layout_of(fx.tcx.types.u16);
609+
610+
for idx in 0..lane_count {
611+
let lane = a.value_lane(fx, idx).load_scalar(fx);
612+
let sat = fx.bcx.ins().umax(lane, min_u16);
613+
let sat = fx.bcx.ins().umin(sat, max_u16);
614+
let res = fx.bcx.ins().ireduce(types::I16, sat);
615+
616+
let res_lane = CValue::by_val(res, ret_lane_layout);
617+
ret.place_lane(fx, idx).write_cvalue(fx, res_lane);
618+
}
619+
620+
for idx in 0..lane_count {
621+
let lane = b.value_lane(fx, idx).load_scalar(fx);
622+
let sat = fx.bcx.ins().umax(lane, min_u16);
623+
let sat = fx.bcx.ins().umin(sat, max_u16);
624+
let res = fx.bcx.ins().ireduce(types::I16, sat);
625+
626+
let res_lane = CValue::by_val(res, ret_lane_layout);
627+
ret.place_lane(fx, lane_count + idx).write_cvalue(fx, res_lane);
628+
}
629+
}
630+
593631
"llvm.x86.avx2.packssdw" => {
594632
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packs_epi32&ig_expand=4892
595633
intrinsic_args!(fx, args => (a, b); intrinsic);
@@ -648,6 +686,106 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
648686
}
649687
}
650688

689+
"llvm.x86.pclmulqdq" => {
690+
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clmulepi64_si128&ig_expand=772
691+
intrinsic_args!(fx, args => (a, b, imm8); intrinsic);
692+
693+
assert_eq!(a.layout(), b.layout());
694+
let layout = a.layout();
695+
696+
let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
697+
let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
698+
assert_eq!(lane_ty, fx.tcx.types.i64);
699+
assert_eq!(ret_lane_ty, fx.tcx.types.i64);
700+
assert_eq!(lane_count, 2);
701+
assert_eq!(ret_lane_count, 2);
702+
703+
let imm8 = imm8.load_scalar(fx);
704+
705+
let control0 = fx.bcx.ins().band_imm(imm8, 0b0000_0001);
706+
let a_lane0 = a.value_lane(fx, 0).load_scalar(fx);
707+
let a_lane1 = a.value_lane(fx, 1).load_scalar(fx);
708+
let temp1 = fx.bcx.ins().select(control0, a_lane1, a_lane0);
709+
710+
let control4 = fx.bcx.ins().band_imm(imm8, 0b0001_0000);
711+
let b_lane0 = b.value_lane(fx, 0).load_scalar(fx);
712+
let b_lane1 = b.value_lane(fx, 1).load_scalar(fx);
713+
let temp2 = fx.bcx.ins().select(control4, b_lane1, b_lane0);
714+
715+
fn extract_bit(fx: &mut FunctionCx<'_, '_, '_>, val: Value, bit: i64) -> Value {
716+
let tmp = fx.bcx.ins().ushr_imm(val, bit);
717+
fx.bcx.ins().band_imm(tmp, 1)
718+
}
719+
720+
let mut res1 = fx.bcx.ins().iconst(types::I64, 0);
721+
for i in 0..=63 {
722+
let x = extract_bit(fx, temp1, 0);
723+
let y = extract_bit(fx, temp2, i);
724+
let mut temp = fx.bcx.ins().band(x, y);
725+
for j in 1..=i {
726+
let x = extract_bit(fx, temp1, j);
727+
let y = extract_bit(fx, temp2, i - j);
728+
let z = fx.bcx.ins().band(x, y);
729+
temp = fx.bcx.ins().bxor(temp, z);
730+
}
731+
let temp = fx.bcx.ins().ishl_imm(temp, i);
732+
res1 = fx.bcx.ins().bor(res1, temp);
733+
}
734+
ret.place_lane(fx, 0).to_ptr().store(fx, res1, MemFlags::trusted());
735+
736+
let mut res2 = fx.bcx.ins().iconst(types::I64, 0);
737+
for i in 64..=127 {
738+
let mut temp = fx.bcx.ins().iconst(types::I64, 0);
739+
for j in i - 63..=63 {
740+
let x = extract_bit(fx, temp1, j);
741+
let y = extract_bit(fx, temp2, i - j);
742+
let z = fx.bcx.ins().band(x, y);
743+
temp = fx.bcx.ins().bxor(temp, z);
744+
}
745+
let temp = fx.bcx.ins().ishl_imm(temp, i);
746+
res2 = fx.bcx.ins().bor(res2, temp);
747+
}
748+
ret.place_lane(fx, 1).to_ptr().store(fx, res2, MemFlags::trusted());
749+
}
750+
751+
"llvm.x86.avx.ptestz.256" => {
752+
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testz_si256&ig_expand=6945
753+
intrinsic_args!(fx, args => (a, b); intrinsic);
754+
755+
assert_eq!(a.layout(), b.layout());
756+
let layout = a.layout();
757+
758+
let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
759+
assert_eq!(lane_ty, fx.tcx.types.i64);
760+
assert_eq!(ret.layout().ty, fx.tcx.types.i32);
761+
assert_eq!(lane_count, 4);
762+
763+
let a_lane0 = a.value_lane(fx, 0).load_scalar(fx);
764+
let a_lane1 = a.value_lane(fx, 1).load_scalar(fx);
765+
let a_lane2 = a.value_lane(fx, 2).load_scalar(fx);
766+
let a_lane3 = a.value_lane(fx, 3).load_scalar(fx);
767+
let b_lane0 = b.value_lane(fx, 0).load_scalar(fx);
768+
let b_lane1 = b.value_lane(fx, 1).load_scalar(fx);
769+
let b_lane2 = b.value_lane(fx, 2).load_scalar(fx);
770+
let b_lane3 = b.value_lane(fx, 3).load_scalar(fx);
771+
772+
let zero0 = fx.bcx.ins().band(a_lane0, b_lane0);
773+
let zero1 = fx.bcx.ins().band(a_lane1, b_lane1);
774+
let zero2 = fx.bcx.ins().band(a_lane2, b_lane2);
775+
let zero3 = fx.bcx.ins().band(a_lane3, b_lane3);
776+
777+
let all_zero0 = fx.bcx.ins().bor(zero0, zero1);
778+
let all_zero1 = fx.bcx.ins().bor(zero2, zero3);
779+
let all_zero = fx.bcx.ins().bor(all_zero0, all_zero1);
780+
781+
let res = fx.bcx.ins().icmp_imm(IntCC::Equal, all_zero, 0);
782+
let res = CValue::by_val(
783+
fx.bcx.ins().uextend(types::I32, res),
784+
fx.layout_of(fx.tcx.types.i32),
785+
);
786+
ret.write_cvalue(fx, res);
787+
}
788+
651789
_ => {
652790
fx.tcx
653791
.sess

0 commit comments

Comments
 (0)