Skip to content

Commit

Permalink
Implement _mm_getcsr, _mm_setcsr, _mm_sfence (rust-lang#88)
Browse files Browse the repository at this point in the history
* Add _mm_sfence

* Add _mm_getcsr/_mm_setcsr and convenience wrappers

* Use test::black_box to simplify tests

* Use uppercase naming for C-macro equivalents

Discussed at rust-lang/stdarch#84
  • Loading branch information
nominolo authored and alexcrichton committed Oct 5, 2017
1 parent 3a19de8 commit 6d2730a
Show file tree
Hide file tree
Showing 2 changed files with 322 additions and 1 deletion.
5 changes: 4 additions & 1 deletion src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -119,11 +119,14 @@
const_fn, link_llvm_intrinsics, platform_intrinsics, repr_simd, simd_ffi,
target_feature, cfg_target_feature, i128_type, asm, const_atomic_usize_new
)]
#![cfg_attr(test, feature(proc_macro))]
#![cfg_attr(test, feature(proc_macro, test))]

#[cfg(test)]
extern crate stdsimd_test;

#[cfg(test)]
extern crate test;

/// Platform independent SIMD vector types and operations.
pub mod simd {
pub use v128::*;
Expand Down
318 changes: 318 additions & 0 deletions src/x86/sse.rs
Original file line number Diff line number Diff line change
Expand Up @@ -268,6 +268,259 @@ pub unsafe fn _mm_movemask_ps(a: f32x4) -> i32 {
movmskps(a)
}

/// Perform a serializing operation on all store-to-memory instructions that
/// were issued prior to this instruction.
///
/// Guarantees that every store instruction that precedes, in program order, is
/// globally visible before any store instruction which follows the fence in
/// program order.
#[inline(always)]
#[target_feature = "+sse"]
#[cfg_attr(test, assert_instr(sfence))]
pub unsafe fn _mm_sfence() {
sfence()
}

/// Get the unsigned 32-bit value of the MXCSR control and status register.
///
/// For more info see [`_mm_setcsr`](fn._mm_setcsr.html)
#[inline(always)]
#[target_feature = "+sse"]
#[cfg_attr(test, assert_instr(stmxcsr))]
pub unsafe fn _mm_getcsr() -> u32 {
let mut result = 0i32;
stmxcsr((&mut result) as *mut _ as *mut i8);
result as u32
}

/// Set the MXCSR register with the 32-bit unsigned integer value.
///
/// This register constrols how SIMD instructions handle floating point
/// operations. Modifying this register only affects the current thread.
///
/// It contains several groups of flags:
///
/// * *Exception flags* report which exceptions occurred since last they were
/// reset.
///
/// * *Masking flags* can be used to mask (ignore) certain exceptions. By default
/// these flags are all set to 1, so all exceptions are masked. When an
/// an exception is masked, the processor simply sets the exception flag and
/// continues the operation. If the exception is unmasked, the flag is also set
/// but additionally an exception handler is invoked.
///
/// * *Rounding mode flags* control the rounding mode of floating point
/// instructions.
///
/// * The *denormals-are-zero mode flag* turns all numbers which would be
/// denormalized (exponent bits are all zeros) into zeros.
///
/// ## Exception Flags
///
/// * `_MM_EXCEPT_INVALID`: An invalid operation was performed (e.g., dividing
/// Infinity by Infinity).
///
/// * `_MM_EXCEPT_DENORM`: An operation attempted to operate on a denormalized
/// number. Mainly this can cause loss of precision.
///
/// * `_MM_EXCEPT_DIV_ZERO`: Division by zero occured.
///
/// * `_MM_EXCEPT_OVERFLOW`: A numeric overflow exception occured, i.e., a
/// result was too large to be represented (e.g., an `f32` with absolute value
/// greater than `2^128`).
///
/// * `_MM_EXCEPT_UNDERFLOW`: A numeric underflow exception occured, i.e., a
/// result was too small to be represented in a normalized way (e.g., an `f32`
/// with absulte value smaller than `2^-126`.)
///
/// * `_MM_EXCEPT_INEXACT`: An inexact-result exception occured (a.k.a.
/// precision exception). This means some precision was lost due to rounding.
/// For example, the fraction `1/3` cannot be represented accurately in a
/// 32 or 64 bit float and computing it would cause this exception to be
/// raised. Precision exceptions are very common, so they are usually masked.
///
/// Exception flags can be read and set using the convenience functions
/// `_MM_GET_EXCEPTION_STATE` and `_MM_SET_EXCEPTION_STATE`. For example, to
/// check if an operation caused some overflow:
///
/// ```rust,ignore
/// _MM_SET_EXCEPTION_STATE(0); // clear all exception flags
/// // perform calculations
/// if _MM_GET_EXCEPTION_STATE() & _MM_EXCEPT_OVERFLOW != 0 {
/// // handle overflow
/// }
/// ```
///
/// ## Masking Flags
///
/// There is one masking flag for each exception flag: `_MM_MASK_INVALID`,
/// `_MM_MASK_DENORM`, `_MM_MASK_DIV_ZERO`, `_MM_MASK_OVERFLOW`,
/// `_MM_MASK_UNDERFLOW`, `_MM_MASK_INEXACT`.
///
/// A single masking bit can be set via
///
/// ```rust,ignore
/// _MM_SET_EXCEPTION_MASK(_MM_MASK_UNDERFLOW);
/// ```
///
/// However, since mask bits are by default all set to 1, it is more common to
/// want to *disable* certain bits. For example, to unmask the underflow
/// exception, use:
///
/// ```rust,ignore
/// _mm_setcsr(_mm_getcsr() & !_MM_MASK_UNDERFLOW); // unmask underflow exception
/// ```
///
/// Warning: an unmasked exception will cause an exception handler to be called.
/// The standard handler will simply terminate the process. So, in this case
/// any underflow exception would terminate the current process with something
/// like `signal: 8, SIGFPE: erroneous arithmetic operation`.
///
/// ## Rounding Mode
///
/// The rounding mode is describe using two bits. It can be read and set using
/// the convenience wrappers `_MM_GET_ROUNDING_MODE()` and
/// `_MM_SET_ROUNDING_MODE(mode)`.
///
/// The rounding modes are:
///
/// * `_MM_ROUND_NEAREST`: (default) Round to closest to the infinite precision
/// value. If two values are equally close, round to even (i.e., least
/// significant bit will be zero).
///
/// * `_MM_ROUND_DOWN`: Round toward negative Infinity.
///
/// * `_MM_ROUND_UP`: Round toward positive Infinity.
///
/// * `_MM_ROUND_TOWARD_ZERO`: Round towards zero (truncate).
///
/// Example:
///
/// ```rust,ignore
/// _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN)
/// ```
///
/// ## Denormals-are-zero/Flush-to-zero Mode
///
/// If this bit is set, values that would be denormalized will be set to zero
/// instead. This is turned off by default.
///
/// You can read and enable/disable this mode via the helper functions
/// `_MM_GET_FLUSH_ZERO_MODE()` and `_MM_SET_FLUSH_ZERO_MODE()`:
///
/// ```rust,ignore
/// _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF); // turn off (default)
/// _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); // turn on
/// ```
///
#[inline(always)]
#[target_feature = "+sse"]
#[cfg_attr(test, assert_instr(ldmxcsr))]
pub unsafe fn _mm_setcsr(val: u32) {
ldmxcsr(&val as *const _ as *const i8);
}

/// See [`_mm_setcsr`](fn._mm_setcsr.html)
pub const _MM_EXCEPT_INVALID: u32 = 0x0001;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
pub const _MM_EXCEPT_DENORM: u32 = 0x0002;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
pub const _MM_EXCEPT_DIV_ZERO: u32 = 0x0004;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
pub const _MM_EXCEPT_OVERFLOW: u32 = 0x0008;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
pub const _MM_EXCEPT_UNDERFLOW: u32 = 0x0010;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
pub const _MM_EXCEPT_INEXACT: u32 = 0x0020;
pub const _MM_EXCEPT_MASK: u32 = 0x003f;

/// See [`_mm_setcsr`](fn._mm_setcsr.html)
pub const _MM_MASK_INVALID: u32 = 0x0080;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
pub const _MM_MASK_DENORM: u32 = 0x0100;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
pub const _MM_MASK_DIV_ZERO: u32 = 0x0200;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
pub const _MM_MASK_OVERFLOW: u32 = 0x0400;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
pub const _MM_MASK_UNDERFLOW: u32 = 0x0800;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
pub const _MM_MASK_INEXACT: u32 = 0x1000;
pub const _MM_MASK_MASK: u32 = 0x1f80;

/// See [`_mm_setcsr`](fn._mm_setcsr.html)
pub const _MM_ROUND_NEAREST: u32 = 0x0000;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
pub const _MM_ROUND_DOWN: u32 = 0x2000;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
pub const _MM_ROUND_UP: u32 = 0x4000;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
pub const _MM_ROUND_TOWARD_ZERO: u32 = 0x6000;
pub const _MM_ROUND_MASK: u32 = 0x6000;

pub const _MM_FLUSH_ZERO_MASK: u32 = 0x8000;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
pub const _MM_FLUSH_ZERO_ON: u32 = 0x8000;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
pub const _MM_FLUSH_ZERO_OFF: u32 = 0x0000;

#[inline(always)]
#[allow(non_snake_case)]
#[target_feature = "+sse"]
pub unsafe fn _MM_GET_EXCEPTION_MASK() -> u32 {
_mm_getcsr() & _MM_MASK_MASK
}

#[inline(always)]
#[allow(non_snake_case)]
#[target_feature = "+sse"]
pub unsafe fn _MM_GET_EXCEPTION_STATE() -> u32 {
_mm_getcsr() & _MM_EXCEPT_MASK
}

#[inline(always)]
#[allow(non_snake_case)]
#[target_feature = "+sse"]
pub unsafe fn _MM_GET_FLUSH_ZERO_MODE() -> u32 {
_mm_getcsr() & _MM_FLUSH_ZERO_MASK
}

#[inline(always)]
#[allow(non_snake_case)]
#[target_feature = "+sse"]
pub unsafe fn _MM_GET_ROUNDING_MODE() -> u32 {
_mm_getcsr() & _MM_ROUND_MASK
}

#[inline(always)]
#[allow(non_snake_case)]
#[target_feature = "+sse"]
pub unsafe fn _MM_SET_EXCEPTION_MASK(x: u32) {
_mm_setcsr((_mm_getcsr() & !_MM_MASK_MASK) | x)
}

#[inline(always)]
#[allow(non_snake_case)]
#[target_feature = "+sse"]
pub unsafe fn _MM_SET_EXCEPTION_STATE(x: u32) {
_mm_setcsr((_mm_getcsr() & !_MM_EXCEPT_MASK) | x)
}

#[inline(always)]
#[allow(non_snake_case)]
#[target_feature = "+sse"]
pub unsafe fn _MM_SET_FLUSH_ZERO_MODE(x: u32) {
let val = (_mm_getcsr() & !_MM_FLUSH_ZERO_MASK) | x;
//println!("setting csr={:x}", val);
_mm_setcsr(val)
}

#[inline(always)]
#[allow(non_snake_case)]
#[target_feature = "+sse"]
pub unsafe fn _MM_SET_ROUNDING_MODE(x: u32) {
_mm_setcsr((_mm_getcsr() & !_MM_ROUND_MASK) | x)
}

/// See [`_mm_prefetch`](fn._mm_prefetch.html).
pub const _MM_HINT_T0: i8 = 3;
Expand Down Expand Up @@ -374,6 +627,12 @@ extern {
fn maxps(a: f32x4, b: f32x4) -> f32x4;
#[link_name = "llvm.x86.sse.movmsk.ps"]
fn movmskps(a: f32x4) -> i32;
#[link_name = "llvm.x86.sse.sfence"]
fn sfence();
#[link_name = "llvm.x86.sse.stmxcsr"]
fn stmxcsr(p: *mut i8);
#[link_name = "llvm.x86.sse.ldmxcsr"]
fn ldmxcsr(p: *const i8);
#[link_name = "llvm.prefetch"]
fn prefetch(p: *const c_void, rw: i32, loc: i32, ty: i32);
}
Expand All @@ -383,6 +642,7 @@ mod tests {
use v128::*;
use x86::sse;
use stdsimd_test::simd_test;
use test::black_box; // Used to inhibit constant-folding.

#[simd_test = "sse"]
unsafe fn _mm_add_ps() {
Expand Down Expand Up @@ -577,4 +837,62 @@ mod tests {
let r = sse::_mm_movemask_ps(f32x4::new(-1.0, -5.0, -5.0, 0.0));
assert_eq!(r, 0b0111);
}

#[simd_test = "sse"]
unsafe fn _mm_sfence() {
sse::_mm_sfence();
}

#[simd_test = "sse"]
unsafe fn _mm_getcsr_setcsr_1() {
let saved_csr = sse::_mm_getcsr();

let a = f32x4::new(1.1e-36, 0.0, 0.0, 1.0);
let b = f32x4::new(0.001, 0.0, 0.0, 1.0);

sse::_MM_SET_FLUSH_ZERO_MODE(sse::_MM_FLUSH_ZERO_ON);
let r = sse::_mm_mul_ps(black_box(a), black_box(b));

sse::_mm_setcsr(saved_csr);

let exp = f32x4::new(0.0, 0.0, 0.0, 1.0);
assert_eq!(r, exp); // first component is a denormalized f32
}

#[simd_test = "sse"]
unsafe fn _mm_getcsr_setcsr_2() {
// Same as _mm_setcsr_1 test, but with opposite flag value.

let saved_csr = sse::_mm_getcsr();

let a = f32x4::new(1.1e-36, 0.0, 0.0, 1.0);
let b = f32x4::new(0.001, 0.0, 0.0, 1.0);

sse::_MM_SET_FLUSH_ZERO_MODE(sse::_MM_FLUSH_ZERO_OFF);
let r = sse::_mm_mul_ps(black_box(a), black_box(b));

sse::_mm_setcsr(saved_csr);

let exp = f32x4::new(1.1e-39, 0.0, 0.0, 1.0);
assert_eq!(r, exp); // first component is a denormalized f32
}

#[simd_test = "sse"]
unsafe fn _mm_getcsr_setcsr_underflow() {
sse::_MM_SET_EXCEPTION_STATE(0);

let a = f32x4::new(1.1e-36, 0.0, 0.0, 1.0);
let b = f32x4::new(1e-5, 0.0, 0.0, 1.0);

assert_eq!(sse::_MM_GET_EXCEPTION_STATE(), 0); // just to be sure

let r = sse::_mm_mul_ps(black_box(a), black_box(b));

let exp = f32x4::new(1.1e-41, 0.0, 0.0, 1.0);
assert_eq!(r, exp);

let underflow =
sse::_MM_GET_EXCEPTION_STATE() & sse::_MM_EXCEPT_UNDERFLOW != 0;
assert_eq!(underflow, true);
}
}

0 comments on commit 6d2730a

Please sign in to comment.