Commit c9d23d2

nominolo authored and alexcrichton committed
Implement SSE _mm_load* instructions (rust-lang#99)
* Add _mm_loadh_pi
* Add doctest for _mm_loadh_pi
* Add _mm_loadl_pi
* Add _mm_load_ss
* Add _mm_load1_ps and _mm_load_ps1
* Add _mm_load_ps and _mm_loadu_ps
* Add _mm_loadr_ps
* Replace _mm_loadu_ps TODO with explanation
* Tweak expected instructions for _mm_loadl/h_pi on x86
* Try fixing i586 test crash
* Targets i586/i686 generate different code for _mm_loadh_pi
1 parent f1dd5a9 commit c9d23d2

File tree

1 file changed: +280 −0 lines changed

Diff for: src/x86/sse.rs

@@ -1,6 +1,9 @@
 use simd_llvm::simd_shuffle4;
 use v128::*;
+use v64::f32x2;
 use std::os::raw::c_void;
+use std::mem;
+use std::ptr;
 
 #[cfg(test)]
 use stdsimd_test::assert_instr;
@@ -343,6 +346,201 @@ pub unsafe fn _mm_movemask_ps(a: f32x4) -> i32 {
     movmskps(a)
 }
 
+/// Set the upper two single-precision floating-point values with 64 bits of
+/// data loaded from the address `p`; the lower two values are passed through
+/// from `a`.
+///
+/// This corresponds to the `MOVHPS` / `MOVHPD` / `VMOVHPD` instructions.
+///
+/// ```rust
+/// # #![feature(cfg_target_feature)]
+/// # #![feature(target_feature)]
+/// #
+/// # #[macro_use] extern crate stdsimd;
+/// #
+/// # // The real main function
+/// # fn main() {
+/// #     if cfg_feature_enabled!("sse") {
+/// #         #[target_feature = "+sse"]
+/// #         fn worker() {
+/// #
+/// # use stdsimd::simd::f32x4;
+/// # use stdsimd::vendor::_mm_loadh_pi;
+/// #
+/// let a = f32x4::new(1.0, 2.0, 3.0, 4.0);
+/// let data: [f32; 4] = [5.0, 6.0, 7.0, 8.0];
+///
+/// let r = unsafe { _mm_loadh_pi(a, data[..].as_ptr()) };
+///
+/// assert_eq!(r, f32x4::new(1.0, 2.0, 5.0, 6.0));
+/// #
+/// #         }
+/// #         worker();
+/// #     }
+/// # }
+/// ```
+#[inline(always)]
+#[target_feature = "+sse"]
+// TODO: generates MOVHPD if the CPU supports SSE2.
+// #[cfg_attr(test, assert_instr(movhps))]
+#[cfg_attr(all(test, target_arch = "x86_64"), assert_instr(movhpd))]
+// 32-bit codegen does not generate `movhps` or `movhpd`, but instead
+// `movsd` followed by `unpcklpd` (or `movss`/`unpcklps` if there's no SSE2).
+#[cfg_attr(all(test, target_arch = "x86", target_feature = "sse2"),
+           assert_instr(unpcklpd))]
+#[cfg_attr(all(test, target_arch = "x86", not(target_feature = "sse2")),
+           assert_instr(unpcklps))]
+// TODO: This function is actually not limited to floats, but that's what
+// matches the C type most closely: (__m128, *const __m64) -> __m128.
+pub unsafe fn _mm_loadh_pi(a: f32x4, p: *const f32) -> f32x4 {
+    let q = p as *const f32x2;
+    let b: f32x2 = *q;
+    let bb = simd_shuffle4(b, b, [0, 1, 0, 1]);
+    simd_shuffle4(a, bb, [0, 1, 4, 5])
+}
+
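Note that the body never touches a memory intrinsic: the first `simd_shuffle4` broadcasts the two loaded lanes into a four-lane vector, and the second splices them in as the upper half of `a`. A pure-Rust model of the semantics (a hypothetical helper for illustration only, not part of the patch; `_mm_loadl_pi` below is the mirror image, replacing the lower half instead):

```rust
// Scalar model of _mm_loadh_pi: result = [a0, a1, p[0], p[1]].
// Illustration only; the real intrinsic works on SIMD registers.
fn loadh_pi_model(a: [f32; 4], p: &[f32; 2]) -> [f32; 4] {
    [a[0], a[1], p[0], p[1]]
}

fn main() {
    let r = loadh_pi_model([1.0, 2.0, 3.0, 4.0], &[5.0, 6.0]);
    assert_eq!(r, [1.0, 2.0, 5.0, 6.0]);
}
```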
+/// Load two floats from `p` into the lower half of a `f32x4`. The upper half
+/// is copied from the upper half of `a`.
+///
+/// This corresponds to the `MOVLPS` / `MOVLPD` / `VMOVLPD` instructions.
+///
+/// ```rust
+/// # #![feature(cfg_target_feature)]
+/// # #![feature(target_feature)]
+/// #
+/// # #[macro_use] extern crate stdsimd;
+/// #
+/// # // The real main function
+/// # fn main() {
+/// #     if cfg_feature_enabled!("sse") {
+/// #         #[target_feature = "+sse"]
+/// #         fn worker() {
+/// #
+/// # use stdsimd::simd::f32x4;
+/// # use stdsimd::vendor::_mm_loadl_pi;
+/// #
+/// let a = f32x4::new(1.0, 2.0, 3.0, 4.0);
+/// let data: [f32; 4] = [5.0, 6.0, 7.0, 8.0];
+///
+/// let r = unsafe { _mm_loadl_pi(a, data[..].as_ptr()) };
+///
+/// assert_eq!(r, f32x4::new(5.0, 6.0, 3.0, 4.0));
+/// #
+/// #         }
+/// #         worker();
+/// #     }
+/// # }
+/// ```
+#[inline(always)]
+#[target_feature = "+sse"]
+// TODO: generates MOVLPD if the CPU supports SSE2.
+// #[cfg_attr(test, assert_instr(movlps))]
+#[cfg_attr(all(test, target_arch = "x86_64"), assert_instr(movlpd))]
+// On 32-bit targets with SSE2, it just generates two `movsd`.
+#[cfg_attr(all(test, target_arch = "x86", target_feature = "sse2"),
+           assert_instr(movsd))]
+// It should really generate `movlps`, but oh well...
+#[cfg_attr(all(test, target_arch = "x86", not(target_feature = "sse2")),
+           assert_instr(movss))]
+// TODO: Like _mm_loadh_pi, this also isn't limited to floats.
+pub unsafe fn _mm_loadl_pi(a: f32x4, p: *const f32) -> f32x4 {
+    let q = p as *const f32x2;
+    let b: f32x2 = *q;
+    let bb = simd_shuffle4(b, b, [0, 1, 0, 1]);
+    simd_shuffle4(a, bb, [4, 5, 2, 3])
+}
+
+/// Construct a `f32x4` with the lowest element read from `p` and the other
+/// elements set to zero.
+///
+/// This corresponds to instructions `VMOVSS` / `MOVSS`.
+#[inline(always)]
+#[target_feature = "+sse"]
+#[cfg_attr(test, assert_instr(movss))]
+pub unsafe fn _mm_load_ss(p: *const f32) -> f32x4 {
+    f32x4::new(*p, 0.0, 0.0, 0.0)
+}
+
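A minimal usage sketch, mirroring the test further down; it assumes the same nightly stdsimd setup as the doctests above (the feature attributes and `cfg_feature_enabled!` guard are omitted for brevity):

```rust
use stdsimd::simd::f32x4;
use stdsimd::vendor::_mm_load_ss;

fn main() {
    let x = 42.0f32;
    // Only the lowest lane is read from memory; the rest are zeroed.
    let r = unsafe { _mm_load_ss(&x) };
    assert_eq!(r, f32x4::new(42.0, 0.0, 0.0, 0.0));
}
```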
+/// Construct a `f32x4` by duplicating the value read from `p` into all
+/// elements.
+///
+/// This corresponds to instructions `VMOVSS` / `MOVSS` followed by some
+/// shuffling.
+#[inline(always)]
+#[target_feature = "+sse"]
+#[cfg_attr(test, assert_instr(movss))]
+pub unsafe fn _mm_load1_ps(p: *const f32) -> f32x4 {
+    let a = *p;
+    f32x4::new(a, a, a, a)
+}
+
+/// Alias for [`_mm_load1_ps`](fn._mm_load1_ps.html).
+#[inline(always)]
+#[target_feature = "+sse"]
+#[cfg_attr(test, assert_instr(movss))]
+pub unsafe fn _mm_load_ps1(p: *const f32) -> f32x4 {
+    _mm_load1_ps(p)
+}
+
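A sketch showing that the alias and the original agree, under the same stdsimd assumptions as the previous example:

```rust
use stdsimd::simd::f32x4;
use stdsimd::vendor::{_mm_load1_ps, _mm_load_ps1};

fn main() {
    let x = 7.0f32;
    // Both names splat the single loaded value into all four lanes.
    let a = unsafe { _mm_load1_ps(&x) };
    let b = unsafe { _mm_load_ps1(&x) };
    assert_eq!(a, f32x4::splat(7.0));
    assert_eq!(a, b);
}
```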
+/// Load four `f32` values from *aligned* memory into a `f32x4`. If the
+/// pointer is not aligned to a 128-bit boundary (16 bytes), a general
+/// protection fault will be triggered (fatal program crash).
+///
+/// Use [`_mm_loadu_ps`](fn._mm_loadu_ps.html) for potentially unaligned
+/// memory.
+///
+/// This corresponds to instructions `VMOVAPS` / `MOVAPS`.
+#[inline(always)]
+#[target_feature = "+sse"]
+#[cfg_attr(test, assert_instr(movaps))]
+pub unsafe fn _mm_load_ps(p: *const f32) -> f32x4 {
+    *(p as *const f32x4)
+}
+
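Because the pointer must be 16-byte aligned, the easiest source in a sketch is storage that already has `f32x4`'s alignment (this assumes an `f32x4` value is 16-byte aligned, as it is on x86):

```rust
use stdsimd::simd::f32x4;
use stdsimd::vendor::_mm_load_ps;

fn main() {
    // An f32x4 value is 16-byte aligned, so its address satisfies
    // the alignment requirement of the aligned load.
    let storage = f32x4::new(1.0, 2.0, 3.0, 4.0);
    let p = &storage as *const f32x4 as *const f32;
    let r = unsafe { _mm_load_ps(p) };
    assert_eq!(r, storage);
}
```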
+/// Load four `f32` values from memory into a `f32x4`. There are no
+/// restrictions on memory alignment. For aligned memory
+/// [`_mm_load_ps`](fn._mm_load_ps.html) may be faster.
+///
+/// This corresponds to instructions `VMOVUPS` / `MOVUPS`.
+#[inline(always)]
+#[target_feature = "+sse"]
+#[cfg_attr(test, assert_instr(movups))]
+pub unsafe fn _mm_loadu_ps(p: *const f32) -> f32x4 {
+    // Note: Using `*p` would require `f32` alignment, but `movups` has no
+    // alignment restrictions.
+    let mut dst = f32x4::splat(mem::uninitialized());
+    ptr::copy_nonoverlapping(
+        p as *const u8,
+        &mut dst as *mut f32x4 as *mut u8,
+        mem::size_of::<f32x4>());
+    dst
+}
+
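A sketch of the unaligned case: offsetting into an `f32` array by one element means 16-byte alignment cannot be relied on, yet the load must still succeed (same stdsimd assumptions as the sketches above):

```rust
use stdsimd::simd::f32x4;
use stdsimd::vendor::_mm_loadu_ps;

fn main() {
    let vals = [1.0f32, 2.0, 3.0, 4.0, 5.0];
    // One element past vals.as_ptr() is 4-byte aligned but almost
    // never 16-byte aligned; the unaligned load handles it anyway.
    let p = unsafe { vals.as_ptr().offset(1) };
    let r = unsafe { _mm_loadu_ps(p) };
    assert_eq!(r, f32x4::new(2.0, 3.0, 4.0, 5.0));
}
```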
+/// Load four `f32` values from aligned memory into a `f32x4` in reverse
+/// order.
+///
+/// If the pointer is not aligned to a 128-bit boundary (16 bytes), a general
+/// protection fault will be triggered (fatal program crash).
+///
+/// Functionally equivalent to the following code sequence (assuming `p`
+/// satisfies the alignment restrictions):
+///
+/// ```text
+/// let a0 = *p;
+/// let a1 = *p.offset(1);
+/// let a2 = *p.offset(2);
+/// let a3 = *p.offset(3);
+/// f32x4::new(a3, a2, a1, a0)
+/// ```
+///
+/// This corresponds to instructions `VMOVAPS` / `MOVAPS` followed by some
+/// shuffling.
+#[inline(always)]
+#[target_feature = "+sse"]
+#[cfg_attr(test, assert_instr(movaps))]
+pub unsafe fn _mm_loadr_ps(p: *const f32) -> f32x4 {
+    let a = _mm_load_ps(p);
+    simd_shuffle4(a, a, [3, 2, 1, 0])
+}
+
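The same aligned-storage trick from the `_mm_load_ps` sketch demonstrates the reversal:

```rust
use stdsimd::simd::f32x4;
use stdsimd::vendor::_mm_loadr_ps;

fn main() {
    // f32x4 storage guarantees the 16-byte alignment the load needs.
    let storage = f32x4::new(1.0, 2.0, 3.0, 4.0);
    let p = &storage as *const f32x4 as *const f32;
    let r = unsafe { _mm_loadr_ps(p) };
    assert_eq!(r, f32x4::new(4.0, 3.0, 2.0, 1.0));
}
```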
 /// Perform a serializing operation on all store-to-memory instructions that
 /// were issued prior to this instruction.
 ///
@@ -938,6 +1136,88 @@ mod tests {
         assert_eq!(r, f32x4::new(1.0, 2.0, 5.0, 6.0));
     }
 
+    #[simd_test = "sse"]
+    unsafe fn _mm_loadh_pi() {
+        let a = f32x4::new(1.0, 2.0, 3.0, 4.0);
+        let x: [f32; 4] = [5.0, 6.0, 7.0, 8.0];
+        let p = x[..].as_ptr();
+        let r = sse::_mm_loadh_pi(a, p);
+        assert_eq!(r, f32x4::new(1.0, 2.0, 5.0, 6.0));
+    }
+
+    #[simd_test = "sse"]
+    unsafe fn _mm_loadl_pi() {
+        let a = f32x4::new(1.0, 2.0, 3.0, 4.0);
+        let x: [f32; 4] = [5.0, 6.0, 7.0, 8.0];
+        let p = x[..].as_ptr();
+        let r = sse::_mm_loadl_pi(a, p);
+        assert_eq!(r, f32x4::new(5.0, 6.0, 3.0, 4.0));
+    }
+
+    #[simd_test = "sse"]
+    unsafe fn _mm_load_ss() {
+        let a = 42.0f32;
+        let r = sse::_mm_load_ss(&a as *const f32);
+        assert_eq!(r, f32x4::new(42.0, 0.0, 0.0, 0.0));
+    }
+
+    #[simd_test = "sse"]
+    unsafe fn _mm_load1_ps() {
+        let a = 42.0f32;
+        let r = sse::_mm_load1_ps(&a as *const f32);
+        assert_eq!(r, f32x4::new(42.0, 42.0, 42.0, 42.0));
+    }
+
+    #[simd_test = "sse"]
+    unsafe fn _mm_load_ps() {
+        let vals = &[1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
+
+        let mut p = vals.as_ptr();
+        let mut fixup = 0.0f32;
+
+        // Make sure p is aligned; otherwise we might get a SIGSEGV
+        // (signal 11: invalid memory reference).
+        let unalignment = (p as usize) & 0xf;
+        if unalignment != 0 {
+            let delta = ((16 - unalignment) >> 2) as isize;
+            fixup = delta as f32;
+            p = p.offset(delta);
+        }
+
+        let r = sse::_mm_load_ps(p);
+        assert_eq!(r, f32x4::new(1.0, 2.0, 3.0, 4.0) + f32x4::splat(fixup));
+    }
+
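The fixup arithmetic deserves a worked example: the slice holds `vals[i] == (i + 1) as f32`, so advancing `p` by `delta` elements shifts every loaded lane up by `delta`, which is why the expected vector gets `+ f32x4::splat(fixup)`. A standalone model of the align-up computation:

```rust
// Number of f32 elements (4 bytes each) to skip so that an address
// reaches the next 16-byte boundary. Standalone model of the test's fixup.
fn align_up_elems(addr: usize) -> usize {
    let unalignment = addr & 0xf; // bytes past the previous 16-byte boundary
    if unalignment == 0 { 0 } else { (16 - unalignment) >> 2 }
}

fn main() {
    assert_eq!(align_up_elems(0x1000), 0); // already aligned
    assert_eq!(align_up_elems(0x1008), 2); // 8 bytes off => skip 2 f32s
    assert_eq!(align_up_elems(0x1004), 3); // 4 bytes off => skip 3 f32s
}
```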
+    #[simd_test = "sse"]
+    unsafe fn _mm_loadu_ps() {
+        let vals = &[1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
+        let p = vals.as_ptr().offset(3);
+        let r = sse::_mm_loadu_ps(black_box(p));
+        assert_eq!(r, f32x4::new(4.0, 5.0, 6.0, 7.0));
+    }
+
+    #[simd_test = "sse"]
+    unsafe fn _mm_loadr_ps() {
+        let vals = &[1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
+
+        let mut p = vals.as_ptr();
+        let mut fixup = 0.0f32;
+
+        // Make sure p is aligned; otherwise we might get a SIGSEGV
+        // (signal 11: invalid memory reference).
+        let unalignment = (p as usize) & 0xf;
+        if unalignment != 0 {
+            let delta = ((16 - unalignment) >> 2) as isize;
+            fixup = delta as f32;
+            p = p.offset(delta);
+        }
+
+        let r = sse::_mm_loadr_ps(p);
+        assert_eq!(r, f32x4::new(4.0, 3.0, 2.0, 1.0) + f32x4::splat(fixup));
+    }
+
     #[simd_test = "sse"]
     unsafe fn _mm_movemask_ps() {
         let r = sse::_mm_movemask_ps(f32x4::new(-1.0, 5.0, -5.0, 0.0));
