@@ -590,6 +590,44 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
590
590
}
591
591
}
592
592
593
+ "llvm.x86.sse41.packusdw" => {
594
+ // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi32&ig_expand=4912
595
+ intrinsic_args ! ( fx, args => ( a, b) ; intrinsic) ;
596
+
597
+ assert_eq ! ( a. layout( ) , b. layout( ) ) ;
598
+ let layout = a. layout ( ) ;
599
+
600
+ let ( lane_count, lane_ty) = layout. ty . simd_size_and_type ( fx. tcx ) ;
601
+ let ( ret_lane_count, ret_lane_ty) = ret. layout ( ) . ty . simd_size_and_type ( fx. tcx ) ;
602
+ assert_eq ! ( lane_ty, fx. tcx. types. i32 ) ;
603
+ assert_eq ! ( ret_lane_ty, fx. tcx. types. u16 ) ;
604
+ assert_eq ! ( lane_count * 2 , ret_lane_count) ;
605
+
606
+ let min_u16 = fx. bcx . ins ( ) . iconst ( types:: I32 , i64:: from ( u16:: MIN ) ) ;
607
+ let max_u16 = fx. bcx . ins ( ) . iconst ( types:: I32 , i64:: from ( u16:: MAX ) ) ;
608
+ let ret_lane_layout = fx. layout_of ( fx. tcx . types . u16 ) ;
609
+
610
+ for idx in 0 ..lane_count {
611
+ let lane = a. value_lane ( fx, idx) . load_scalar ( fx) ;
612
+ let sat = fx. bcx . ins ( ) . umax ( lane, min_u16) ;
613
+ let sat = fx. bcx . ins ( ) . umin ( sat, max_u16) ;
614
+ let res = fx. bcx . ins ( ) . ireduce ( types:: I16 , sat) ;
615
+
616
+ let res_lane = CValue :: by_val ( res, ret_lane_layout) ;
617
+ ret. place_lane ( fx, idx) . write_cvalue ( fx, res_lane) ;
618
+ }
619
+
620
+ for idx in 0 ..lane_count {
621
+ let lane = b. value_lane ( fx, idx) . load_scalar ( fx) ;
622
+ let sat = fx. bcx . ins ( ) . umax ( lane, min_u16) ;
623
+ let sat = fx. bcx . ins ( ) . umin ( sat, max_u16) ;
624
+ let res = fx. bcx . ins ( ) . ireduce ( types:: I16 , sat) ;
625
+
626
+ let res_lane = CValue :: by_val ( res, ret_lane_layout) ;
627
+ ret. place_lane ( fx, lane_count + idx) . write_cvalue ( fx, res_lane) ;
628
+ }
629
+ }
630
+
593
631
"llvm.x86.avx2.packssdw" => {
594
632
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packs_epi32&ig_expand=4892
595
633
intrinsic_args ! ( fx, args => ( a, b) ; intrinsic) ;
@@ -648,6 +686,106 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
648
686
}
649
687
}
650
688
689
+ "llvm.x86.pclmulqdq" => {
690
+ // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clmulepi64_si128&ig_expand=772
691
+ intrinsic_args ! ( fx, args => ( a, b, imm8) ; intrinsic) ;
692
+
693
+ assert_eq ! ( a. layout( ) , b. layout( ) ) ;
694
+ let layout = a. layout ( ) ;
695
+
696
+ let ( lane_count, lane_ty) = layout. ty . simd_size_and_type ( fx. tcx ) ;
697
+ let ( ret_lane_count, ret_lane_ty) = ret. layout ( ) . ty . simd_size_and_type ( fx. tcx ) ;
698
+ assert_eq ! ( lane_ty, fx. tcx. types. i64 ) ;
699
+ assert_eq ! ( ret_lane_ty, fx. tcx. types. i64 ) ;
700
+ assert_eq ! ( lane_count, 2 ) ;
701
+ assert_eq ! ( ret_lane_count, 2 ) ;
702
+
703
+ let imm8 = imm8. load_scalar ( fx) ;
704
+
705
+ let control0 = fx. bcx . ins ( ) . band_imm ( imm8, 0b0000_0001 ) ;
706
+ let a_lane0 = a. value_lane ( fx, 0 ) . load_scalar ( fx) ;
707
+ let a_lane1 = a. value_lane ( fx, 1 ) . load_scalar ( fx) ;
708
+ let temp1 = fx. bcx . ins ( ) . select ( control0, a_lane1, a_lane0) ;
709
+
710
+ let control4 = fx. bcx . ins ( ) . band_imm ( imm8, 0b0001_0000 ) ;
711
+ let b_lane0 = b. value_lane ( fx, 0 ) . load_scalar ( fx) ;
712
+ let b_lane1 = b. value_lane ( fx, 1 ) . load_scalar ( fx) ;
713
+ let temp2 = fx. bcx . ins ( ) . select ( control4, b_lane1, b_lane0) ;
714
+
715
+ fn extract_bit ( fx : & mut FunctionCx < ' _ , ' _ , ' _ > , val : Value , bit : i64 ) -> Value {
716
+ let tmp = fx. bcx . ins ( ) . ushr_imm ( val, bit) ;
717
+ fx. bcx . ins ( ) . band_imm ( tmp, 1 )
718
+ }
719
+
720
+ let mut res1 = fx. bcx . ins ( ) . iconst ( types:: I64 , 0 ) ;
721
+ for i in 0 ..=63 {
722
+ let x = extract_bit ( fx, temp1, 0 ) ;
723
+ let y = extract_bit ( fx, temp2, i) ;
724
+ let mut temp = fx. bcx . ins ( ) . band ( x, y) ;
725
+ for j in 1 ..=i {
726
+ let x = extract_bit ( fx, temp1, j) ;
727
+ let y = extract_bit ( fx, temp2, i - j) ;
728
+ let z = fx. bcx . ins ( ) . band ( x, y) ;
729
+ temp = fx. bcx . ins ( ) . bxor ( temp, z) ;
730
+ }
731
+ let temp = fx. bcx . ins ( ) . ishl_imm ( temp, i) ;
732
+ res1 = fx. bcx . ins ( ) . bor ( res1, temp) ;
733
+ }
734
+ ret. place_lane ( fx, 0 ) . to_ptr ( ) . store ( fx, res1, MemFlags :: trusted ( ) ) ;
735
+
736
+ let mut res2 = fx. bcx . ins ( ) . iconst ( types:: I64 , 0 ) ;
737
+ for i in 64 ..=127 {
738
+ let mut temp = fx. bcx . ins ( ) . iconst ( types:: I64 , 0 ) ;
739
+ for j in i - 63 ..=63 {
740
+ let x = extract_bit ( fx, temp1, j) ;
741
+ let y = extract_bit ( fx, temp2, i - j) ;
742
+ let z = fx. bcx . ins ( ) . band ( x, y) ;
743
+ temp = fx. bcx . ins ( ) . bxor ( temp, z) ;
744
+ }
745
+ let temp = fx. bcx . ins ( ) . ishl_imm ( temp, i) ;
746
+ res2 = fx. bcx . ins ( ) . bor ( res2, temp) ;
747
+ }
748
+ ret. place_lane ( fx, 1 ) . to_ptr ( ) . store ( fx, res2, MemFlags :: trusted ( ) ) ;
749
+ }
750
+
751
+ "llvm.x86.avx.ptestz.256" => {
752
+ // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testz_si256&ig_expand=6945
753
+ intrinsic_args ! ( fx, args => ( a, b) ; intrinsic) ;
754
+
755
+ assert_eq ! ( a. layout( ) , b. layout( ) ) ;
756
+ let layout = a. layout ( ) ;
757
+
758
+ let ( lane_count, lane_ty) = layout. ty . simd_size_and_type ( fx. tcx ) ;
759
+ assert_eq ! ( lane_ty, fx. tcx. types. i64 ) ;
760
+ assert_eq ! ( ret. layout( ) . ty, fx. tcx. types. i32 ) ;
761
+ assert_eq ! ( lane_count, 4 ) ;
762
+
763
+ let a_lane0 = a. value_lane ( fx, 0 ) . load_scalar ( fx) ;
764
+ let a_lane1 = a. value_lane ( fx, 1 ) . load_scalar ( fx) ;
765
+ let a_lane2 = a. value_lane ( fx, 2 ) . load_scalar ( fx) ;
766
+ let a_lane3 = a. value_lane ( fx, 3 ) . load_scalar ( fx) ;
767
+ let b_lane0 = b. value_lane ( fx, 0 ) . load_scalar ( fx) ;
768
+ let b_lane1 = b. value_lane ( fx, 1 ) . load_scalar ( fx) ;
769
+ let b_lane2 = b. value_lane ( fx, 2 ) . load_scalar ( fx) ;
770
+ let b_lane3 = b. value_lane ( fx, 3 ) . load_scalar ( fx) ;
771
+
772
+ let zero0 = fx. bcx . ins ( ) . band ( a_lane0, b_lane0) ;
773
+ let zero1 = fx. bcx . ins ( ) . band ( a_lane1, b_lane1) ;
774
+ let zero2 = fx. bcx . ins ( ) . band ( a_lane2, b_lane2) ;
775
+ let zero3 = fx. bcx . ins ( ) . band ( a_lane3, b_lane3) ;
776
+
777
+ let all_zero0 = fx. bcx . ins ( ) . bor ( zero0, zero1) ;
778
+ let all_zero1 = fx. bcx . ins ( ) . bor ( zero2, zero3) ;
779
+ let all_zero = fx. bcx . ins ( ) . bor ( all_zero0, all_zero1) ;
780
+
781
+ let res = fx. bcx . ins ( ) . icmp_imm ( IntCC :: Equal , all_zero, 0 ) ;
782
+ let res = CValue :: by_val (
783
+ fx. bcx . ins ( ) . uextend ( types:: I32 , res) ,
784
+ fx. layout_of ( fx. tcx . types . i32 ) ,
785
+ ) ;
786
+ ret. write_cvalue ( fx, res) ;
787
+ }
788
+
651
789
_ => {
652
790
fx. tcx
653
791
. sess
0 commit comments