Skip to content

Commit ecf79a3

Browse files
committedNov 7, 2023
Implement all vendor intrinsics used by the fimg crate
1 parent 0a35232 commit ecf79a3

File tree

1 file changed

+154
-0
lines changed

1 file changed

+154
-0
lines changed
 

‎src/intrinsics/llvm_x86.rs

+154
Original file line numberDiff line numberDiff line change
@@ -494,6 +494,160 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
494494
}
495495
}
496496

497+
"llvm.x86.avx2.packuswb" => {
498+
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packus_epi16&ig_expand=4906
499+
intrinsic_args!(fx, args => (a, b); intrinsic);
500+
501+
assert_eq!(a.layout(), b.layout());
502+
let layout = a.layout();
503+
504+
let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
505+
let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
506+
assert_eq!(lane_ty, fx.tcx.types.i16);
507+
assert_eq!(ret_lane_ty, fx.tcx.types.u8);
508+
assert_eq!(lane_count * 2, ret_lane_count);
509+
510+
let zero = fx.bcx.ins().iconst(types::I16, 0);
511+
let max_u8 = fx.bcx.ins().iconst(types::I16, 255);
512+
let ret_lane_layout = fx.layout_of(fx.tcx.types.u8);
513+
514+
for idx in 0..lane_count / 2 {
515+
let lane = a.value_lane(fx, idx).load_scalar(fx);
516+
let sat = fx.bcx.ins().smax(lane, zero);
517+
let sat = fx.bcx.ins().umin(sat, max_u8);
518+
let res = fx.bcx.ins().ireduce(types::I8, sat);
519+
520+
let res_lane = CValue::by_val(res, ret_lane_layout);
521+
ret.place_lane(fx, idx).write_cvalue(fx, res_lane);
522+
}
523+
524+
for idx in 0..lane_count / 2 {
525+
let lane = b.value_lane(fx, idx).load_scalar(fx);
526+
let sat = fx.bcx.ins().smax(lane, zero);
527+
let sat = fx.bcx.ins().umin(sat, max_u8);
528+
let res = fx.bcx.ins().ireduce(types::I8, sat);
529+
530+
let res_lane = CValue::by_val(res, ret_lane_layout);
531+
ret.place_lane(fx, lane_count / 2 + idx).write_cvalue(fx, res_lane);
532+
}
533+
534+
for idx in 0..lane_count / 2 {
535+
let lane = a.value_lane(fx, idx).load_scalar(fx);
536+
let sat = fx.bcx.ins().smax(lane, zero);
537+
let sat = fx.bcx.ins().umin(sat, max_u8);
538+
let res = fx.bcx.ins().ireduce(types::I8, sat);
539+
540+
let res_lane = CValue::by_val(res, ret_lane_layout);
541+
ret.place_lane(fx, lane_count / 2 * 2 + idx).write_cvalue(fx, res_lane);
542+
}
543+
544+
for idx in 0..lane_count / 2 {
545+
let lane = b.value_lane(fx, idx).load_scalar(fx);
546+
let sat = fx.bcx.ins().smax(lane, zero);
547+
let sat = fx.bcx.ins().umin(sat, max_u8);
548+
let res = fx.bcx.ins().ireduce(types::I8, sat);
549+
550+
let res_lane = CValue::by_val(res, ret_lane_layout);
551+
ret.place_lane(fx, lane_count / 2 * 3 + idx).write_cvalue(fx, res_lane);
552+
}
553+
}
554+
555+
"llvm.x86.sse2.packssdw.128" => {
556+
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi32&ig_expand=4889
557+
intrinsic_args!(fx, args => (a, b); intrinsic);
558+
559+
assert_eq!(a.layout(), b.layout());
560+
let layout = a.layout();
561+
562+
let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
563+
let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
564+
assert_eq!(lane_ty, fx.tcx.types.i32);
565+
assert_eq!(ret_lane_ty, fx.tcx.types.i16);
566+
assert_eq!(lane_count * 2, ret_lane_count);
567+
568+
let min_i16 = fx.bcx.ins().iconst(types::I32, i64::from(i16::MIN as u16));
569+
let max_i16 = fx.bcx.ins().iconst(types::I32, i64::from(i16::MAX as u16));
570+
let ret_lane_layout = fx.layout_of(fx.tcx.types.i16);
571+
572+
for idx in 0..lane_count {
573+
let lane = a.value_lane(fx, idx).load_scalar(fx);
574+
let sat = fx.bcx.ins().smax(lane, min_i16);
575+
let sat = fx.bcx.ins().umin(sat, max_i16);
576+
let res = fx.bcx.ins().ireduce(types::I16, sat);
577+
578+
let res_lane = CValue::by_val(res, ret_lane_layout);
579+
ret.place_lane(fx, idx).write_cvalue(fx, res_lane);
580+
}
581+
582+
for idx in 0..lane_count {
583+
let lane = b.value_lane(fx, idx).load_scalar(fx);
584+
let sat = fx.bcx.ins().smax(lane, min_i16);
585+
let sat = fx.bcx.ins().umin(sat, max_i16);
586+
let res = fx.bcx.ins().ireduce(types::I16, sat);
587+
588+
let res_lane = CValue::by_val(res, ret_lane_layout);
589+
ret.place_lane(fx, lane_count + idx).write_cvalue(fx, res_lane);
590+
}
591+
}
592+
593+
"llvm.x86.avx2.packssdw" => {
594+
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packs_epi32&ig_expand=4892
595+
intrinsic_args!(fx, args => (a, b); intrinsic);
596+
597+
assert_eq!(a.layout(), b.layout());
598+
let layout = a.layout();
599+
600+
let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
601+
let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
602+
assert_eq!(lane_ty, fx.tcx.types.i32);
603+
assert_eq!(ret_lane_ty, fx.tcx.types.i16);
604+
assert_eq!(lane_count * 2, ret_lane_count);
605+
606+
let min_i16 = fx.bcx.ins().iconst(types::I32, i64::from(i16::MIN as u16));
607+
let max_i16 = fx.bcx.ins().iconst(types::I32, i64::from(i16::MAX as u16));
608+
let ret_lane_layout = fx.layout_of(fx.tcx.types.i16);
609+
610+
for idx in 0..lane_count / 2 {
611+
let lane = a.value_lane(fx, idx).load_scalar(fx);
612+
let sat = fx.bcx.ins().smax(lane, min_i16);
613+
let sat = fx.bcx.ins().umin(sat, max_i16);
614+
let res = fx.bcx.ins().ireduce(types::I16, sat);
615+
616+
let res_lane = CValue::by_val(res, ret_lane_layout);
617+
ret.place_lane(fx, idx).write_cvalue(fx, res_lane);
618+
}
619+
620+
for idx in 0..lane_count / 2 {
621+
let lane = b.value_lane(fx, idx).load_scalar(fx);
622+
let sat = fx.bcx.ins().smax(lane, min_i16);
623+
let sat = fx.bcx.ins().umin(sat, max_i16);
624+
let res = fx.bcx.ins().ireduce(types::I16, sat);
625+
626+
let res_lane = CValue::by_val(res, ret_lane_layout);
627+
ret.place_lane(fx, lane_count / 2 + idx).write_cvalue(fx, res_lane);
628+
}
629+
630+
for idx in 0..lane_count / 2 {
631+
let lane = a.value_lane(fx, idx).load_scalar(fx);
632+
let sat = fx.bcx.ins().smax(lane, min_i16);
633+
let sat = fx.bcx.ins().umin(sat, max_i16);
634+
let res = fx.bcx.ins().ireduce(types::I16, sat);
635+
636+
let res_lane = CValue::by_val(res, ret_lane_layout);
637+
ret.place_lane(fx, lane_count / 2 * 2 + idx).write_cvalue(fx, res_lane);
638+
}
639+
640+
for idx in 0..lane_count / 2 {
641+
let lane = b.value_lane(fx, idx).load_scalar(fx);
642+
let sat = fx.bcx.ins().smax(lane, min_i16);
643+
let sat = fx.bcx.ins().umin(sat, max_i16);
644+
let res = fx.bcx.ins().ireduce(types::I16, sat);
645+
646+
let res_lane = CValue::by_val(res, ret_lane_layout);
647+
ret.place_lane(fx, lane_count / 2 * 3 + idx).write_cvalue(fx, res_lane);
648+
}
649+
}
650+
497651
_ => {
498652
fx.tcx
499653
.sess

0 commit comments

Comments
 (0)
Please sign in to comment.