Skip to content

Commit c8ae80e

Browse files
RalfJung
authored and Amanieu committed
non-temporal stores: document interaction with Rust memory model
1 parent 0df7764 commit c8ae80e

File tree

6 files changed

+181
-5
lines changed

6 files changed

+181
-5
lines changed

crates/core_arch/src/x86/avx.rs

+27
Original file line numberDiff line numberDiff line change
@@ -1692,6 +1692,15 @@ pub unsafe fn _mm256_lddqu_si256(mem_addr: *const __m256i) -> __m256i {
16921692
/// non-temporal (unlikely to be used again soon)
16931693
///
16941694
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_stream_si256)
1695+
///
1696+
/// # Safety of non-temporal stores
1697+
///
1698+
/// After using this intrinsic, but before any other access to the memory that this intrinsic
1699+
/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
1700+
/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
1701+
/// return.
1702+
///
1703+
/// See [`_mm_sfence`] for details.
16951704
#[inline]
16961705
#[target_feature(enable = "avx")]
16971706
#[cfg_attr(test, assert_instr(vmovntps))] // FIXME vmovntdq
@@ -1705,6 +1714,15 @@ pub unsafe fn _mm256_stream_si256(mem_addr: *mut __m256i, a: __m256i) {
17051714
/// flagged as non-temporal (unlikely to be used again soon).
17061715
///
17071716
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_stream_pd)
1717+
///
1718+
/// # Safety of non-temporal stores
1719+
///
1720+
/// After using this intrinsic, but before any other access to the memory that this intrinsic
1721+
/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
1722+
/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
1723+
/// return.
1724+
///
1725+
/// See [`_mm_sfence`] for details.
17081726
#[inline]
17091727
#[target_feature(enable = "avx")]
17101728
#[cfg_attr(test, assert_instr(vmovntps))] // FIXME vmovntpd
@@ -1720,6 +1738,15 @@ pub unsafe fn _mm256_stream_pd(mem_addr: *mut f64, a: __m256d) {
17201738
/// soon).
17211739
///
17221740
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_stream_ps)
1741+
///
1742+
/// # Safety of non-temporal stores
1743+
///
1744+
/// After using this intrinsic, but before any other access to the memory that this intrinsic
1745+
/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
1746+
/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
1747+
/// return.
1748+
///
1749+
/// See [`_mm_sfence`] for details.
17231750
#[inline]
17241751
#[target_feature(enable = "avx")]
17251752
#[cfg_attr(test, assert_instr(vmovntps))]

crates/core_arch/src/x86/avx512f.rs

+27
Original file line numberDiff line numberDiff line change
@@ -27998,6 +27998,15 @@ pub unsafe fn _mm_mask_testn_epi64_mask(k: __mmask8, a: __m128i, b: __m128i) ->
2799827998
/// Store 512-bits (composed of 16 packed single-precision (32-bit) floating-point elements) from a into memory using a non-temporal memory hint. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
2799927999
///
2800028000
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_stream_ps&expand=5671)
28001+
///
28002+
/// # Safety of non-temporal stores
28003+
///
28004+
/// After using this intrinsic, but before any other access to the memory that this intrinsic
28005+
/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
28006+
/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
28007+
/// return.
28008+
///
28009+
/// See [`_mm_sfence`] for details.
2800128010
#[inline]
2800228011
#[target_feature(enable = "avx512f")]
2800328012
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
@@ -28010,6 +28019,15 @@ pub unsafe fn _mm512_stream_ps(mem_addr: *mut f32, a: __m512) {
2801028019
/// Store 512-bits (composed of 8 packed double-precision (64-bit) floating-point elements) from a into memory using a non-temporal memory hint. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
2801128020
///
2801228021
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_stream_pd&expand=5667)
28022+
///
28023+
/// # Safety of non-temporal stores
28024+
///
28025+
/// After using this intrinsic, but before any other access to the memory that this intrinsic
28026+
/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
28027+
/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
28028+
/// return.
28029+
///
28030+
/// See [`_mm_sfence`] for details.
2801328031
#[inline]
2801428032
#[target_feature(enable = "avx512f")]
2801528033
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
@@ -28022,6 +28040,15 @@ pub unsafe fn _mm512_stream_pd(mem_addr: *mut f64, a: __m512d) {
2802228040
/// Store 512-bits of integer data from a into memory using a non-temporal memory hint. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
2802328041
///
2802428042
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_stream_si512&expand=5675)
28043+
///
28044+
/// # Safety of non-temporal stores
28045+
///
28046+
/// After using this intrinsic, but before any other access to the memory that this intrinsic
28047+
/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
28048+
/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
28049+
/// return.
28050+
///
28051+
/// See [`_mm_sfence`] for details.
2802528052
#[inline]
2802628053
#[target_feature(enable = "avx512f")]
2802728054
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]

crates/core_arch/src/x86/sse.rs

+73-5
Original file line numberDiff line numberDiff line change
@@ -1348,14 +1348,73 @@ pub unsafe fn _mm_move_ss(a: __m128, b: __m128) -> __m128 {
13481348
simd_shuffle!(a, b, [4, 1, 2, 3])
13491349
}
13501350

1351-
/// Performs a serializing operation on all store-to-memory instructions that
1352-
/// were issued prior to this instruction.
1351+
/// Performs a serializing operation on all non-temporal ("streaming") store instructions that
1352+
/// were issued by the current thread prior to this instruction.
13531353
///
1354-
/// Guarantees that every store instruction that precedes, in program order, is
1355-
/// globally visible before any store instruction which follows the fence in
1356-
/// program order.
1354+
/// Guarantees that every non-temporal store instruction that precedes this fence, in program order, is
1355+
/// ordered before any load or store instruction which follows the fence in
1356+
/// synchronization order.
13571357
///
13581358
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sfence)
1359+
/// (but note that Intel is only documenting the hardware-level concerns related to this
1360+
/// instruction; the Intel documentation does not take into account the extra concerns that arise
1361+
/// because the Rust memory model is different from the x86 memory model.)
1362+
///
1363+
/// # Safety of non-temporal stores
1364+
///
1365+
/// After using any non-temporal store intrinsic, but before any other access to the memory that the
1366+
/// intrinsic mutates, a call to `_mm_sfence` must be performed on the thread that used the
1367+
/// intrinsic.
1368+
///
1369+
/// Non-temporal stores behave very differently from regular stores. For the purpose of the Rust
1370+
/// memory model, these stores are happening asynchronously in a background thread. This means a
1371+
/// non-temporal store can cause data races with other accesses, even other accesses on the same
1372+
/// thread. It also means that cross-thread synchronization does not work as expected: let's say the
1373+
/// intrinsic is called on thread T1, and T1 performs synchronization with some other thread T2. The
1374+
/// non-temporal store acts as if it happened not in T1 but in a different thread T3, and T2 has not
1375+
/// synchronized with T3! Calling `_mm_sfence` makes the current thread wait for and synchronize
1376+
/// with all the non-temporal stores previously started on this thread, which means in particular
1377+
/// that subsequent synchronization with other threads will then work as intended again.
1378+
///
1379+
/// The general pattern to use non-temporal stores correctly is to call `_mm_sfence` before your
1380+
/// code jumps back to code outside your library. This ensures all stores inside your function
1381+
/// are synchronized-before the return, and thus transitively synchronized-before everything
1382+
/// the caller does after your function returns.
1383+
//
1384+
// The following is not a doc comment since it's not clear whether we want to put this into the
1385+
// docs, but it should be written out somewhere.
1386+
//
1387+
// Formally, we consider non-temporal stores and sfences to be opaque blobs that the compiler cannot
1388+
// inspect, and that behave like the following functions. This explains where the docs above come
1389+
// from.
1390+
// ```
1391+
// #[thread_local]
1392+
// static mut PENDING_NONTEMP_WRITES: AtomicUsize = AtomicUsize::new(0);
1393+
//
1394+
// pub unsafe fn nontemporal_store<T>(ptr: *mut T, val: T) {
1395+
// PENDING_NONTEMP_WRITES.fetch_add(1, Relaxed);
1396+
// // Spawn a thread that will eventually do our write.
1397+
// // We need to fetch a pointer to this thread's pending-write
1398+
// // counter, so that we can access it from the background thread.
1399+
// let pending_writes = addr_of!(PENDING_NONTEMP_WRITES);
1400+
// // If this was actual Rust code we'd have to do some extra work
1401+
// // because `ptr`, `val`, `pending_writes` are all `!Send`. We skip that here.
1402+
// std::thread::spawn(move || {
1403+
// // Do the write in the background thread.
1404+
// ptr.write(val);
1405+
// // Register the write as done. Crucially, this is `Release`, so it
1406+
// // syncs-with the `Acquire` in `sfence`.
1407+
// (&*pending_writes).fetch_sub(1, Release);
1408+
// });
1409+
// }
1410+
//
1411+
// pub fn sfence() {
1412+
// unsafe {
1413+
// // Wait until there are no more pending writes.
1414+
// while PENDING_NONTEMP_WRITES.load(Acquire) > 0 {}
1415+
// }
1416+
// }
1417+
// ```
13591418
#[inline]
13601419
#[target_feature(enable = "sse")]
13611420
#[cfg_attr(test, assert_instr(sfence))]
@@ -1938,6 +1997,15 @@ extern "C" {
19381997
/// exception _may_ be generated.
19391998
///
19401999
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_ps)
2000+
///
2001+
/// # Safety of non-temporal stores
2002+
///
2003+
/// After using this intrinsic, but before any other access to the memory that this intrinsic
2004+
/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
2005+
/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
2006+
/// return.
2007+
///
2008+
/// See [`_mm_sfence`] for details.
19412009
#[inline]
19422010
#[target_feature(enable = "sse")]
19432011
#[cfg_attr(test, assert_instr(movntps))]

crates/core_arch/src/x86/sse2.rs

+27
Original file line numberDiff line numberDiff line change
@@ -1315,6 +1315,15 @@ pub unsafe fn _mm_storel_epi64(mem_addr: *mut __m128i, a: __m128i) {
13151315
/// used again soon).
13161316
///
13171317
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si128)
1318+
///
1319+
/// # Safety of non-temporal stores
1320+
///
1321+
/// After using this intrinsic, but before any other access to the memory that this intrinsic
1322+
/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
1323+
/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
1324+
/// return.
1325+
///
1326+
/// See [`_mm_sfence`] for details.
13181327
#[inline]
13191328
#[target_feature(enable = "sse2")]
13201329
#[cfg_attr(test, assert_instr(movntps))] // FIXME movntdq
@@ -1328,6 +1337,15 @@ pub unsafe fn _mm_stream_si128(mem_addr: *mut __m128i, a: __m128i) {
13281337
/// used again soon).
13291338
///
13301339
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si32)
1340+
///
1341+
/// # Safety of non-temporal stores
1342+
///
1343+
/// After using this intrinsic, but before any other access to the memory that this intrinsic
1344+
/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
1345+
/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
1346+
/// return.
1347+
///
1348+
/// See [`_mm_sfence`] for details.
13311349
#[inline]
13321350
#[target_feature(enable = "sse2")]
13331351
#[cfg_attr(test, assert_instr(movnti))]
@@ -2513,6 +2531,15 @@ pub unsafe fn _mm_loadl_pd(a: __m128d, mem_addr: *const f64) -> __m128d {
25132531
/// used again soon).
25142532
///
25152533
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_pd)
2534+
///
2535+
/// # Safety of non-temporal stores
2536+
///
2537+
/// After using this intrinsic, but before any other access to the memory that this intrinsic
2538+
/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
2539+
/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
2540+
/// return.
2541+
///
2542+
/// See [`_mm_sfence`] for details.
25162543
#[inline]
25172544
#[target_feature(enable = "sse2")]
25182545
#[cfg_attr(test, assert_instr(movntps))] // FIXME movntpd

crates/core_arch/src/x86/sse4a.rs

+18
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,15 @@ pub unsafe fn _mm_insert_si64(x: __m128i, y: __m128i) -> __m128i {
5959
/// Non-temporal store of `a.0` into `p`.
6060
///
6161
/// Writes 64-bit data to a memory location without polluting the caches.
62+
///
63+
/// # Safety of non-temporal stores
64+
///
65+
/// After using this intrinsic, but before any other access to the memory that this intrinsic
66+
/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
67+
/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
68+
/// return.
69+
///
70+
/// See [`_mm_sfence`] for details.
6271
#[inline]
6372
#[target_feature(enable = "sse4a")]
6473
#[cfg_attr(test, assert_instr(movntsd))]
@@ -70,6 +79,15 @@ pub unsafe fn _mm_stream_sd(p: *mut f64, a: __m128d) {
7079
/// Non-temporal store of `a.0` into `p`.
7180
///
7281
/// Writes 32-bit data to a memory location without polluting the caches.
82+
///
83+
/// # Safety of non-temporal stores
84+
///
85+
/// After using this intrinsic, but before any other access to the memory that this intrinsic
86+
/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
87+
/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
88+
/// return.
89+
///
90+
/// See [`_mm_sfence`] for details.
7391
#[inline]
7492
#[target_feature(enable = "sse4a")]
7593
#[cfg_attr(test, assert_instr(movntss))]

crates/core_arch/src/x86_64/sse2.rs

+9
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,15 @@ pub unsafe fn _mm_cvttsd_si64x(a: __m128d) -> i64 {
6767
/// used again soon).
6868
///
6969
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si64)
70+
///
71+
/// # Safety of non-temporal stores
72+
///
73+
/// After using this intrinsic, but before any other access to the memory that this intrinsic
74+
/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
75+
/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
76+
/// return.
77+
///
78+
/// See [`_mm_sfence`] for details.
7079
#[inline]
7180
#[target_feature(enable = "sse2")]
7281
#[cfg_attr(test, assert_instr(movnti))]

0 commit comments

Comments
 (0)