|
1 | 1 | use simd_llvm::simd_shuffle4;
|
2 | 2 | use v128::*;
|
| 3 | +use v64::f32x2; |
3 | 4 | use std::os::raw::c_void;
|
| 5 | +use std::mem; |
| 6 | +use std::ptr; |
4 | 7 |
|
5 | 8 | #[cfg(test)]
|
6 | 9 | use stdsimd_test::assert_instr;
|
@@ -343,6 +346,201 @@ pub unsafe fn _mm_movemask_ps(a: f32x4) -> i32 {
|
343 | 346 | movmskps(a)
|
344 | 347 | }
|
345 | 348 |
|
/// Set the upper two single-precision floating-point values with 64 bits of
/// data loaded from the address `p`; the lower two values are passed through
/// from `a`.
///
/// This corresponds to the `MOVHPS` / `MOVHPD` / `VMOVHPD` instructions.
///
/// # Safety
///
/// `p` must be valid for reading two consecutive `f32` values (8 bytes).
/// NOTE(review): the read goes through a `*const f32x2`, which presumably
/// also requires `p` to be aligned for `f32x2` — confirm.
///
/// ```rust
/// # #![feature(cfg_target_feature)]
/// # #![feature(target_feature)]
/// #
/// # #[macro_use] extern crate stdsimd;
/// #
/// # // The real main function
/// # fn main() {
/// # if cfg_feature_enabled!("sse") {
/// # #[target_feature = "+sse"]
/// # fn worker() {
/// #
/// # use stdsimd::simd::f32x4;
/// # use stdsimd::vendor::_mm_loadh_pi;
/// #
/// let a = f32x4::new(1.0, 2.0, 3.0, 4.0);
/// let data: [f32; 4] = [5.0, 6.0, 7.0, 8.0];
///
/// let r = unsafe { _mm_loadh_pi(a, data[..].as_ptr()) };
///
/// assert_eq!(r, f32x4::new(1.0, 2.0, 5.0, 6.0));
/// #
/// # }
/// # worker();
/// # }
/// # }
/// ```
#[inline(always)]
#[target_feature = "+sse"]
// TODO: generates MOVHPD if the CPU supports SSE2.
// #[cfg_attr(test, assert_instr(movhps))]
#[cfg_attr(all(test, target_arch = "x86_64"), assert_instr(movhpd))]
// 32-bit codegen does not generate `movhps` or `movhpd`, but instead
// `movsd` followed by `unpcklpd` (or `movss`/`unpcklps` if there's no SSE2).
#[cfg_attr(all(test, target_arch = "x86", target_feature = "sse2"),
           assert_instr(unpcklpd))]
#[cfg_attr(all(test, target_arch = "x86", not(target_feature = "sse2")),
           assert_instr(unpcklps))]
// TODO: This function is actually not limited to floats, but that's what
// matches the C type most closely: (__m128, *const __m64) -> __m128
pub unsafe fn _mm_loadh_pi(a: f32x4, p: *const f32) -> f32x4 {
    // Read the two `f32`s at `p` as a single 64-bit pair.
    let q = p as *const f32x2;
    let b: f32x2 = *q;
    // Widen the pair to four lanes so it can be shuffled together with `a`.
    let bb = simd_shuffle4(b, b, [0, 1, 0, 1]);
    // Lanes 0-1 come from `a`, lanes 2-3 from the loaded pair.
    simd_shuffle4(a, bb, [0, 1, 4, 5])
}
| 401 | + |
/// Load two floats from `p` into the lower half of a `f32x4`. The upper half
/// is copied from the upper half of `a`.
///
/// This corresponds to the `MOVLPS` / `MOVLPD` / `VMOVLPD` instructions.
///
/// # Safety
///
/// `p` must be valid for reading two consecutive `f32` values (8 bytes).
/// NOTE(review): the read goes through a `*const f32x2`, which presumably
/// also requires `p` to be aligned for `f32x2` — confirm.
///
/// ```rust
/// # #![feature(cfg_target_feature)]
/// # #![feature(target_feature)]
/// #
/// # #[macro_use] extern crate stdsimd;
/// #
/// # // The real main function
/// # fn main() {
/// # if cfg_feature_enabled!("sse") {
/// # #[target_feature = "+sse"]
/// # fn worker() {
/// #
/// # use stdsimd::simd::f32x4;
/// # use stdsimd::vendor::_mm_loadl_pi;
/// #
/// let a = f32x4::new(1.0, 2.0, 3.0, 4.0);
/// let data: [f32; 4] = [5.0, 6.0, 7.0, 8.0];
///
/// let r = unsafe { _mm_loadl_pi(a, data[..].as_ptr()) };
///
/// assert_eq!(r, f32x4::new(5.0, 6.0, 3.0, 4.0));
/// #
/// # }
/// # worker();
/// # }
/// # }
/// ```
#[inline(always)]
#[target_feature = "+sse"]
// TODO: generates MOVLPD if the CPU supports SSE2.
// #[cfg_attr(test, assert_instr(movlps))]
#[cfg_attr(all(test, target_arch = "x86_64"), assert_instr(movlpd))]
// On 32-bit targets with SSE2, it just generates two `movsd`.
#[cfg_attr(all(test, target_arch = "x86", target_feature = "sse2"),
           assert_instr(movsd))]
// It should really generate "movlps", but oh well...
#[cfg_attr(all(test, target_arch = "x86", not(target_feature = "sse2")),
           assert_instr(movss))]
// TODO: Like _mm_loadh_pi, this also isn't limited to floats.
pub unsafe fn _mm_loadl_pi(a: f32x4, p: *const f32) -> f32x4 {
    // Read the two `f32`s at `p` as a single 64-bit pair.
    let q = p as *const f32x2;
    let b: f32x2 = *q;
    // Widen the pair to four lanes so it can be shuffled together with `a`.
    let bb = simd_shuffle4(b, b, [0, 1, 0, 1]);
    // Lanes 0-1 come from the loaded pair, lanes 2-3 from `a`.
    simd_shuffle4(a, bb, [4, 5, 2, 3])
}
| 452 | + |
/// Construct a `f32x4` with the lowest element read from `p` and the other
/// elements set to zero.
///
/// This corresponds to instructions `VMOVSS` / `MOVSS`.
///
/// # Safety
///
/// `p` must be valid for reading one `f32`.
#[inline(always)]
#[target_feature = "+sse"]
#[cfg_attr(test, assert_instr(movss))]
pub unsafe fn _mm_load_ss(p: *const f32) -> f32x4 {
    f32x4::new(*p, 0.0, 0.0, 0.0)
}
| 463 | + |
/// Construct a `f32x4` by duplicating the value read from `p` into all
/// elements.
///
/// This corresponds to instructions `VMOVSS` / `MOVSS` followed by some
/// shuffling.
///
/// # Safety
///
/// `p` must be valid for reading one `f32`.
#[inline(always)]
#[target_feature = "+sse"]
#[cfg_attr(test, assert_instr(movss))]
pub unsafe fn _mm_load1_ps(p: *const f32) -> f32x4 {
    // Read once, broadcast to all four lanes.
    let a = *p;
    f32x4::new(a, a, a, a)
}
| 476 | + |
/// Alias for [`_mm_load1_ps`](fn._mm_load1_ps.html)
///
/// # Safety
///
/// Same contract as `_mm_load1_ps`: `p` must be valid for reading one `f32`.
#[inline(always)]
#[target_feature = "+sse"]
#[cfg_attr(test, assert_instr(movss))]
pub unsafe fn _mm_load_ps1(p: *const f32) -> f32x4 {
    _mm_load1_ps(p)
}
| 484 | + |
/// Load four `f32` values from *aligned* memory into a `f32x4`. If the pointer
/// is not aligned to a 128-bit boundary (16 bytes) a general protection fault
/// will be triggered (fatal program crash).
///
/// Use [`_mm_loadu_ps`](fn._mm_loadu_ps.html) for potentially unaligned memory.
///
/// This corresponds to instructions `VMOVAPS` / `MOVAPS`.
///
/// # Safety
///
/// `p` must be 16-byte aligned and valid for reading four consecutive `f32`
/// values (16 bytes).
#[inline(always)]
#[target_feature = "+sse"]
#[cfg_attr(test, assert_instr(movaps))]
pub unsafe fn _mm_load_ps(p: *const f32) -> f32x4 {
    // A plain dereference of `*const f32x4` compiles to an aligned 128-bit
    // load (`movaps`).
    *(p as *const f32x4)
}
| 498 | + |
| 499 | +/// Load four `f32` values from memory into a `f32x4`. There are no restrictions |
| 500 | +/// on memory alignment. For aligned memory [`_mm_load_ps`](fn._mm_load_ps.html) |
| 501 | +/// may be faster. |
| 502 | +/// |
| 503 | +/// This corresponds to instructions `VMOVUPS` / `MOVUPS`. |
| 504 | +#[inline(always)] |
| 505 | +#[target_feature = "+sse"] |
| 506 | +#[cfg_attr(test, assert_instr(movups))] |
| 507 | +pub unsafe fn _mm_loadu_ps(p: *const f32) -> f32x4 { |
| 508 | + // Note: Using `*p` would require `f32` alignment, but `movups` has no |
| 509 | + // alignment restrictions. |
| 510 | + let mut dst = f32x4::splat(mem::uninitialized()); |
| 511 | + ptr::copy_nonoverlapping( |
| 512 | + p as *const u8, |
| 513 | + &mut dst as *mut f32x4 as *mut u8, |
| 514 | + mem::size_of::<f32x4>()); |
| 515 | + dst |
| 516 | +} |
| 517 | + |
/// Load four `f32` values from aligned memory into a `f32x4` in reverse order.
///
/// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general
/// protection fault will be triggered (fatal program crash).
///
/// Functionally equivalent to the following code sequence (assuming `p`
/// satisfies the alignment restrictions):
///
/// ```text
/// let a0 = *p;
/// let a1 = *p.offset(1);
/// let a2 = *p.offset(2);
/// let a3 = *p.offset(3);
/// f32x4::new(a3, a2, a1, a0)
/// ```
///
/// This corresponds to instructions `VMOVAPS` / `MOVAPS` followed by some
/// shuffling.
///
/// # Safety
///
/// Same contract as `_mm_load_ps`: `p` must be 16-byte aligned and valid for
/// reading four consecutive `f32` values (16 bytes).
#[inline(always)]
#[target_feature = "+sse"]
#[cfg_attr(test, assert_instr(movaps))]
pub unsafe fn _mm_loadr_ps(p: *const f32) -> f32x4 {
    // Aligned load, then reverse the lane order.
    let a = _mm_load_ps(p);
    simd_shuffle4(a, a, [3, 2, 1, 0])
}
| 543 | + |
346 | 544 | /// Perform a serializing operation on all store-to-memory instructions that
|
347 | 545 | /// were issued prior to this instruction.
|
348 | 546 | ///
|
@@ -938,6 +1136,88 @@ mod tests {
|
938 | 1136 | assert_eq!(r, f32x4::new(1.0, 2.0, 5.0, 6.0));
|
939 | 1137 | }
|
940 | 1138 |
|
| 1139 | + #[simd_test = "sse"] |
| 1140 | + unsafe fn _mm_loadh_pi() { |
| 1141 | + let a = f32x4::new(1.0, 2.0, 3.0, 4.0); |
| 1142 | + let x: [f32; 4] = [5.0, 6.0, 7.0, 8.0]; |
| 1143 | + let p = x[..].as_ptr(); |
| 1144 | + let r = sse::_mm_loadh_pi(a, p); |
| 1145 | + assert_eq!(r, f32x4::new(1.0, 2.0, 5.0, 6.0)); |
| 1146 | + } |
| 1147 | + |
| 1148 | + #[simd_test = "sse"] |
| 1149 | + unsafe fn _mm_loadl_pi() { |
| 1150 | + let a = f32x4::new(1.0, 2.0, 3.0, 4.0); |
| 1151 | + let x: [f32; 4] = [5.0, 6.0, 7.0, 8.0]; |
| 1152 | + let p = x[..].as_ptr(); |
| 1153 | + let r = sse::_mm_loadl_pi(a, p); |
| 1154 | + assert_eq!(r, f32x4::new(5.0, 6.0, 3.0, 4.0)); |
| 1155 | + } |
| 1156 | + |
| 1157 | + #[simd_test = "sse"] |
| 1158 | + unsafe fn _mm_load_ss() { |
| 1159 | + let a = 42.0f32; |
| 1160 | + let r = sse::_mm_load_ss(&a as *const f32); |
| 1161 | + assert_eq!(r, f32x4::new(42.0, 0.0, 0.0, 0.0)); |
| 1162 | + } |
| 1163 | + |
| 1164 | + #[simd_test = "sse"] |
| 1165 | + unsafe fn _mm_load1_ps() { |
| 1166 | + let a = 42.0f32; |
| 1167 | + let r = sse::_mm_load1_ps(&a as *const f32); |
| 1168 | + assert_eq!(r, f32x4::new(42.0, 42.0, 42.0, 42.0)); |
| 1169 | + } |
| 1170 | + |
| 1171 | + #[simd_test = "sse"] |
| 1172 | + unsafe fn _mm_load_ps() { |
| 1173 | + let vals = &[1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]; |
| 1174 | + |
| 1175 | + let mut p = vals.as_ptr(); |
| 1176 | + let mut fixup = 0.0f32; |
| 1177 | + |
| 1178 | + // Make sure p is aligned, otherwise we might get a |
| 1179 | + // (signal: 11, SIGSEGV: invalid memory reference) |
| 1180 | + |
| 1181 | + let unalignment = (p as usize) & 0xf; |
| 1182 | + if unalignment != 0 { |
| 1183 | + let delta = ((16 - unalignment) >> 2) as isize; |
| 1184 | + fixup = delta as f32; |
| 1185 | + p = p.offset(delta); |
| 1186 | + } |
| 1187 | + |
| 1188 | + let r = sse::_mm_load_ps(p); |
| 1189 | + assert_eq!(r, f32x4::new(1.0, 2.0, 3.0, 4.0) + f32x4::splat(fixup)); |
| 1190 | + } |
| 1191 | + |
| 1192 | + #[simd_test = "sse"] |
| 1193 | + unsafe fn _mm_loadu_ps() { |
| 1194 | + let vals = &[1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]; |
| 1195 | + let p = vals.as_ptr().offset(3); |
| 1196 | + let r = sse::_mm_loadu_ps(black_box(p)); |
| 1197 | + assert_eq!(r, f32x4::new(4.0, 5.0, 6.0, 7.0)); |
| 1198 | + } |
| 1199 | + |
| 1200 | + #[simd_test = "sse"] |
| 1201 | + unsafe fn _mm_loadr_ps() { |
| 1202 | + let vals = &[1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]; |
| 1203 | + |
| 1204 | + let mut p = vals.as_ptr(); |
| 1205 | + let mut fixup = 0.0f32; |
| 1206 | + |
| 1207 | + // Make sure p is aligned, otherwise we might get a |
| 1208 | + // (signal: 11, SIGSEGV: invalid memory reference) |
| 1209 | + |
| 1210 | + let unalignment = (p as usize) & 0xf; |
| 1211 | + if unalignment != 0 { |
| 1212 | + let delta = ((16 - unalignment) >> 2) as isize; |
| 1213 | + fixup = delta as f32; |
| 1214 | + p = p.offset(delta); |
| 1215 | + } |
| 1216 | + |
| 1217 | + let r = sse::_mm_loadr_ps(p); |
| 1218 | + assert_eq!(r, f32x4::new(4.0, 3.0, 2.0, 1.0) + f32x4::splat(fixup)); |
| 1219 | + } |
| 1220 | + |
941 | 1221 | #[simd_test = "sse"]
|
942 | 1222 | unsafe fn _mm_movemask_ps() {
|
943 | 1223 | let r = sse::_mm_movemask_ps(f32x4::new(-1.0, 5.0, -5.0, 0.0));
|
|
0 commit comments