Skip to content

Commit

Permalink
Implement _mm256_permute2f128_ps and _mm256_permute2f128_pd intrinsics
Browse files Browse the repository at this point in the history
  • Loading branch information
bjorn3 committed Nov 5, 2023
1 parent f6a8c3a commit 6a53ace
Show file tree
Hide file tree
Showing 2 changed files with 75 additions and 19 deletions.
38 changes: 19 additions & 19 deletions src/intrinsics/llvm_x86.rs
Original file line number Diff line number Diff line change
Expand Up @@ -172,8 +172,12 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
}
}
}
"llvm.x86.avx2.vperm2i128" => {
"llvm.x86.avx2.vperm2i128"
| "llvm.x86.avx.vperm2f128.ps.256"
| "llvm.x86.avx.vperm2f128.pd.256" => {
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2x128_si256
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2f128_ps
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2f128_pd
let (a, b, imm8) = match args {
[a, b, imm8] => (a, b, imm8),
_ => bug!("wrong number of args for intrinsic {intrinsic}"),
Expand All @@ -182,19 +186,11 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
let b = codegen_operand(fx, b);
let imm8 = codegen_operand(fx, imm8).load_scalar(fx);

let a_0 = a.value_lane(fx, 0).load_scalar(fx);
let a_1 = a.value_lane(fx, 1).load_scalar(fx);
let a_low = fx.bcx.ins().iconcat(a_0, a_1);
let a_2 = a.value_lane(fx, 2).load_scalar(fx);
let a_3 = a.value_lane(fx, 3).load_scalar(fx);
let a_high = fx.bcx.ins().iconcat(a_2, a_3);
let a_low = a.value_typed_lane(fx, fx.tcx.types.u128, 0).load_scalar(fx);
let a_high = a.value_typed_lane(fx, fx.tcx.types.u128, 1).load_scalar(fx);

let b_0 = b.value_lane(fx, 0).load_scalar(fx);
let b_1 = b.value_lane(fx, 1).load_scalar(fx);
let b_low = fx.bcx.ins().iconcat(b_0, b_1);
let b_2 = b.value_lane(fx, 2).load_scalar(fx);
let b_3 = b.value_lane(fx, 3).load_scalar(fx);
let b_high = fx.bcx.ins().iconcat(b_2, b_3);
let b_low = b.value_typed_lane(fx, fx.tcx.types.u128, 0).load_scalar(fx);
let b_high = b.value_typed_lane(fx, fx.tcx.types.u128, 1).load_scalar(fx);

fn select4(
fx: &mut FunctionCx<'_, '_, '_>,
Expand All @@ -219,16 +215,20 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(

let control0 = imm8;
let res_low = select4(fx, a_high, a_low, b_high, b_low, control0);
let (res_0, res_1) = fx.bcx.ins().isplit(res_low);

let control1 = fx.bcx.ins().ushr_imm(imm8, 4);
let res_high = select4(fx, a_high, a_low, b_high, b_low, control1);
let (res_2, res_3) = fx.bcx.ins().isplit(res_high);

ret.place_lane(fx, 0).to_ptr().store(fx, res_0, MemFlags::trusted());
ret.place_lane(fx, 1).to_ptr().store(fx, res_1, MemFlags::trusted());
ret.place_lane(fx, 2).to_ptr().store(fx, res_2, MemFlags::trusted());
ret.place_lane(fx, 3).to_ptr().store(fx, res_3, MemFlags::trusted());
ret.place_typed_lane(fx, fx.tcx.types.u128, 0).to_ptr().store(
fx,
res_low,
MemFlags::trusted(),
);
ret.place_typed_lane(fx, fx.tcx.types.u128, 1).to_ptr().store(
fx,
res_high,
MemFlags::trusted(),
);
}
"llvm.x86.ssse3.pabs.b.128" | "llvm.x86.ssse3.pabs.w.128" | "llvm.x86.ssse3.pabs.d.128" => {
let a = match args {
Expand Down
56 changes: 56 additions & 0 deletions src/value_and_place.rs
Original file line number Diff line number Diff line change
Expand Up @@ -243,6 +243,34 @@ impl<'tcx> CValue<'tcx> {
let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
let lane_layout = fx.layout_of(lane_ty);
assert!(lane_idx < lane_count);

match self.0 {
CValueInner::ByVal(_) | CValueInner::ByValPair(_, _) => unreachable!(),
CValueInner::ByRef(ptr, None) => {
let field_offset = lane_layout.size * lane_idx;
let field_ptr = ptr.offset_i64(fx, i64::try_from(field_offset.bytes()).unwrap());
CValue::by_ref(field_ptr, lane_layout)
}
CValueInner::ByRef(_, Some(_)) => unreachable!(),
}
}

/// Like [`CValue::value_field`] except using the passed type as lane type instead of the one
/// specified by the vector type.
pub(crate) fn value_typed_lane(
self,
fx: &mut FunctionCx<'_, '_, 'tcx>,
lane_ty: Ty<'tcx>,
lane_idx: u64,
) -> CValue<'tcx> {
let layout = self.1;
assert!(layout.ty.is_simd());
let (orig_lane_count, orig_lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
let lane_layout = fx.layout_of(lane_ty);
assert!(
(lane_idx + 1) * lane_layout.size <= orig_lane_count * fx.layout_of(orig_lane_ty).size
);

match self.0 {
CValueInner::ByVal(_) | CValueInner::ByValPair(_, _) => unreachable!(),
CValueInner::ByRef(ptr, None) => {
Expand Down Expand Up @@ -734,6 +762,34 @@ impl<'tcx> CPlace<'tcx> {
}
}

/// Like [`CPlace::place_field`] except using the passed type as lane type instead of the one
/// specified by the vector type.
pub(crate) fn place_typed_lane(
self,
fx: &mut FunctionCx<'_, '_, 'tcx>,
lane_ty: Ty<'tcx>,
lane_idx: u64,
) -> CPlace<'tcx> {
let layout = self.layout();
assert!(layout.ty.is_simd());
let (orig_lane_count, orig_lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
let lane_layout = fx.layout_of(lane_ty);
assert!(
(lane_idx + 1) * lane_layout.size <= orig_lane_count * fx.layout_of(orig_lane_ty).size
);

match self.inner {
CPlaceInner::Var(_, _) => unreachable!(),
CPlaceInner::VarPair(_, _, _) => unreachable!(),
CPlaceInner::Addr(ptr, None) => {
let field_offset = lane_layout.size * lane_idx;
let field_ptr = ptr.offset_i64(fx, i64::try_from(field_offset.bytes()).unwrap());
CPlace::for_ptr(field_ptr, lane_layout)
}
CPlaceInner::Addr(_, Some(_)) => unreachable!(),
}
}

pub(crate) fn place_index(
self,
fx: &mut FunctionCx<'_, '_, 'tcx>,
Expand Down

0 comments on commit 6a53ace

Please sign in to comment.