@@ -1348,14 +1348,73 @@ pub unsafe fn _mm_move_ss(a: __m128, b: __m128) -> __m128 {
1348
1348
simd_shuffle ! ( a, b, [ 4 , 1 , 2 , 3 ] )
1349
1349
}
1350
1350
1351
- /// Performs a serializing operation on all store-to-memory instructions that
1352
- /// were issued prior to this instruction.
1351
+ /// Performs a serializing operation on all non-temporal ("streaming") store instructions that
1352
+ /// were issued by the current thread prior to this instruction.
1353
1353
///
1354
- /// Guarantees that every store instruction that precedes, in program order, is
1355
- /// globally visible before any store instruction which follows the fence in
1356
- /// program order.
1354
+ /// Guarantees that every non-temporal store instruction that precedes this fence, in program order, is
1355
+ /// ordered before any load or store instruction which follows the fence in
1356
+ /// synchronization order.
1357
1357
///
1358
1358
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sfence)
1359
+ /// (but note that Intel is only documenting the hardware-level concerns related to this
1360
+ /// instruction; the Intel documentation does not take into account the extra concerns that arise
1361
+ /// because the Rust memory model is different from the x86 memory model.)
1362
+ ///
1363
+ /// # Safety of non-temporal stores
1364
+ ///
1365
+ /// After using any non-temporal store intrinsic, but before any other access to the memory that the
1366
+ /// intrinsic mutates, a call to `_mm_sfence` must be performed on the thread that used the
1367
+ /// intrinsic.
1368
+ ///
1369
+ /// Non-temporal stores behave very differently from regular stores. For the purpose of the Rust
1370
+ /// memory model, these stores are happening asynchronously in a background thread. This means a
1371
+ /// non-temporal store can cause data races with other accesses, even other accesses on the same
1372
+ /// thread. It also means that cross-thread synchronization does not work as expected: let's say the
1373
+ /// intrinsic is called on thread T1, and T1 performs synchronization with some other thread T2. The
1374
+ /// non-temporal store acts as if it happened not in T1 but in a different thread T3, and T2 has not
1375
+ /// synchronized with T3! Calling `_mm_sfence` makes the current thread wait for and synchronize
1376
+ /// with all the non-temporal stores previously started on this thread, which means in particular
1377
+ /// that subsequent synchronization with other threads will then work as intended again.
1378
+ ///
1379
+ /// The general pattern to use non-temporal stores correctly is to call `_mm_sfence` before your
1380
+ /// code jumps back to code outside your library. This ensures all stores inside your function
1381
+ /// are synchronized-before the return, and thus transitively synchronized-before everything
1382
+ /// the caller does after your function returns.
1383
+ //
1384
+ // The following is not a doc comment since it's not clear whether we want to put this into the
1385
+ // docs, but it should be written out somewhere.
1386
+ //
1387
+ // Formally, we consider non-temporal stores and sfences to be opaque blobs that the compiler cannot
1388
+ // inspect, and that behave like the following functions. This explains where the docs above come
1389
+ // from.
1390
+ // ```
1391
+ // #[thread_local]
1392
+ // static mut PENDING_NONTEMP_WRITES: AtomicUsize = AtomicUsize::new(0);
1393
+ //
1394
+ // pub unsafe fn nontemporal_store<T>(ptr: *mut T, val: T) {
1395
+ // PENDING_NONTEMP_WRITES.fetch_add(1, Relaxed);
1396
+ // // Spawn a thread that will eventually do our write.
1397
+ // // We need to fetch a pointer to this thread's pending-write
1398
+ // // counter, so that we can access it from the background thread.
1399
+ // let pending_writes = addr_of!(PENDING_NONTEMP_WRITES);
1400
+ // // If this was actual Rust code we'd have to do some extra work
1401
+ // // because `ptr`, `val`, `pending_writes` are all `!Send`. We skip that here.
1402
+ // std::thread::spawn(move || {
1403
+ // // Do the write in the background thread.
1404
+ // ptr.write(val);
1405
+ // // Register the write as done. Crucially, this is `Release`, so it
1406
+ // // syncs-with the `Acquire` in `sfence`.
1407
+ // (&*pending_writes).fetch_sub(1, Release);
1408
+ // });
1409
+ // }
1410
+ //
1411
+ // pub fn sfence() {
1412
+ // unsafe {
1413
+ // // Wait until there are no more pending writes.
1414
+ // while PENDING_NONTEMP_WRITES.load(Acquire) > 0 {}
1415
+ // }
1416
+ // }
1417
+ // ```
1359
1418
#[ inline]
1360
1419
#[ target_feature( enable = "sse" ) ]
1361
1420
#[ cfg_attr( test, assert_instr( sfence) ) ]
@@ -1938,6 +1997,15 @@ extern "C" {
1938
1997
/// exception _may_ be generated.
1939
1998
///
1940
1999
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_ps)
2000
+ ///
2001
+ /// # Safety of non-temporal stores
2002
+ ///
2003
+ /// After using this intrinsic, but before any other access to the memory that this intrinsic
2004
+ /// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
2005
+ /// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
2006
+ /// return.
2007
+ ///
2008
+ /// See [`_mm_sfence`] for details.
1941
2009
#[ inline]
1942
2010
#[ target_feature( enable = "sse" ) ]
1943
2011
#[ cfg_attr( test, assert_instr( movntps) ) ]
0 commit comments