make SC fences stronger, to be correct wrt C++20

rust-lang · Nov 24, 2024 · fd0945a · fd0945a
1 parent 8fc1776
commit fd0945a
Show file tree

Hide file tree

Showing 6 changed files with 68 additions and 140 deletions.
diff --git a/README.md b/README.md
@@ -68,9 +68,10 @@ Further caveats that Miri users should be aware of:
   not support networking. System API support varies between targets; if you run
   on Windows it is a good idea to use `--target x86_64-unknown-linux-gnu` to get
   better support.
-* Weak memory emulation may [produce weak behaviors](https://github.com/rust-lang/miri/issues/2301)
-  when `SeqCst` fences are used that are not actually permitted by the Rust memory model, and it
-  cannot produce all behaviors possibly observable on real hardware.
+* Weak memory emulation is not complete: there are legal behaviors that Miri will never produce.
+  However, Miri produces many behaviors that are hard to observe on real hardware, so it can help
+  quite a bit in finding weak memory concurrency bugs. To be really sure about complicated atomic
+  code, use specialized tools such as [loom](https://github.com/tokio-rs/loom).
 
 Moreover, Miri fundamentally cannot ensure that your code is *sound*. [Soundness] is the property of
 never causing undefined behavior when invoked from arbitrary safe code, even in combination with

diff --git a/src/concurrency/data_race.rs b/src/concurrency/data_race.rs
@@ -111,12 +111,6 @@ pub(super) struct ThreadClockSet {
     /// have been released by this thread by a release fence.
     fence_release: VClock,
 
-    /// Timestamps of the last SC fence performed by each
-    /// thread, updated when this thread performs an SC fence.
-    /// This is never acquired into the thread's clock, it
-    /// just limits which old writes can be seen in weak memory emulation.
-    pub(super) fence_seqcst: VClock,
-
     /// Timestamps of the last SC write performed by each
     /// thread, updated when this thread performs an SC fence.
     /// This is never acquired into the thread's clock, it
@@ -344,11 +338,13 @@ pub struct GlobalState {
     /// active vector-clocks catch up with the threads timestamp.
     reuse_candidates: RefCell<FxHashSet<VectorIdx>>,
 
-    /// The timestamp of last SC fence performed by each thread
+    /// We make SC fences act like RMWs on a global location.
+    /// To implement that, they all release and acquire into this clock.
     last_sc_fence: RefCell<VClock>,
 
-    /// The timestamp of last SC write performed by each thread
-    last_sc_write: RefCell<VClock>,
+    /// The timestamp of last SC write performed by each thread.
+    /// Threads only update their own index here!
+    last_sc_write_per_thread: RefCell<VClock>,
 
     /// Track when an outdated (weak memory) load happens.
     pub track_outdated_loads: bool,
@@ -883,9 +879,17 @@ pub trait EvalContextExt<'tcx>: MiriInterpCxExt<'tcx> {
                         clocks.apply_release_fence();
                     }
                     if atomic == AtomicFenceOrd::SeqCst {
-                        data_race.last_sc_fence.borrow_mut().set_at_index(&clocks.clock, index);
-                        clocks.fence_seqcst.join(&data_race.last_sc_fence.borrow());
-                        clocks.write_seqcst.join(&data_race.last_sc_write.borrow());
+                        // Behave like an RMW on the global fence location. This takes full care of
+                        // all the SC fence requirements, including C++17 §32.4 [atomics.order]
+                        // paragraph 6 (which would limit what future reads can see). It also rules
+                        // out many legal behaviors, but we don't currently have a model that would
+                        // be more precise.
+                        let mut sc_fence_clock = data_race.last_sc_fence.borrow_mut();
+                        sc_fence_clock.join(&clocks.clock);
+                        clocks.clock.join(&sc_fence_clock);
+                        // Also establish some sort of order with the last SC write that happened, globally
+                        // (but this is only respected by future reads).
+                        clocks.write_seqcst.join(&data_race.last_sc_write_per_thread.borrow());
                     }
 
                     // Increment timestamp in case of release semantics.
@@ -1555,7 +1559,7 @@ impl GlobalState {
             thread_info: RefCell::new(IndexVec::new()),
             reuse_candidates: RefCell::new(FxHashSet::default()),
             last_sc_fence: RefCell::new(VClock::default()),
-            last_sc_write: RefCell::new(VClock::default()),
+            last_sc_write_per_thread: RefCell::new(VClock::default()),
             track_outdated_loads: config.track_outdated_loads,
         };
 
@@ -1857,7 +1861,7 @@ impl GlobalState {
     // SC ATOMIC STORE rule in the paper.
     pub(super) fn sc_write(&self, thread_mgr: &ThreadManager<'_>) {
         let (index, clocks) = self.active_thread_state(thread_mgr);
-        self.last_sc_write.borrow_mut().set_at_index(&clocks.clock, index);
+        self.last_sc_write_per_thread.borrow_mut().set_at_index(&clocks.clock, index);
     }
 
     // SC ATOMIC READ rule in the paper.

diff --git a/src/concurrency/weak_memory.rs b/src/concurrency/weak_memory.rs
@@ -11,14 +11,17 @@
 //! This implementation is not fully correct under the revised C++20 model and may generate behaviours C++20
 //! disallows (<https://github.com/rust-lang/miri/issues/2301>).
 //!
-//! A modification is made to the paper's model to partially address C++20 changes.
-//! Specifically, if an SC load reads from an atomic store of any ordering, then a later SC load cannot read from
-//! an earlier store in the location's modification order. This is to prevent creating a backwards S edge from the second
-//! load to the first, as a result of C++20's coherence-ordered before rules.
-//! (This seems to rule out behaviors that were actually permitted by the RC11 model that C++20
-//! intended to copy (https://plv.mpi-sws.org/scfix/paper.pdf); a change was introduced when
-//! translating the math to English. According to Viktor Vafeiadis, this difference is harmless. So
-//! we stick to what the standard says, and allow fewer behaviors.)
+//! Modifications are made to the paper's model to address C++20 changes:
+//! - If an SC load reads from an atomic store of any ordering, then a later SC load cannot read
+//!   from an earlier store in the location's modification order. This is to prevent creating a
+//!   backwards S edge from the second load to the first, as a result of C++20's coherence-ordered
+//!   before rules. (This seems to rule out behaviors that were actually permitted by the RC11 model
+//!   that C++20 intended to copy (https://plv.mpi-sws.org/scfix/paper.pdf); a change was introduced
+//!   when translating the math to English. According to Viktor Vafeiadis, this difference is
+//!   harmless. So we stick to what the standard says, and allow fewer behaviors.)
+//! - SC fences are treated like RMWs to a global clock, to ensure they induce enough
+//!   synchronization with the surrounding accesses. This rules out legal behavior, but it is really
+//!   hard to be more precise here.
 //!
 //! Rust follows the C++20 memory model (except for the Consume ordering and some operations not performable through C++'s
 //! `std::atomic<T>` API). It is therefore possible for this implementation to generate behaviours never observable when the
@@ -340,11 +343,6 @@ impl<'tcx> StoreBuffer {
                     // then we cannot read-from anything earlier in modification order.
                     // C++20 §6.9.2.2 [intro.races] paragraph 16
                     false
-                } else if store_elem.timestamp <= clocks.fence_seqcst[store_elem.store_index] {
-                    // The current load, which may be sequenced-after an SC fence, cannot read-before
-                    // the last store sequenced-before an SC fence in another thread.
-                    // C++17 §32.4 [atomics.order] paragraph 6
-                    false
                 } else if store_elem.timestamp <= clocks.write_seqcst[store_elem.store_index]
                     && store_elem.is_seqcst
                 {

diff --git a/tests/fail/should-pass/cpp20_rwc_syncs.rs b/tests/fail/should-pass/cpp20_rwc_syncs.rs
diff --git a/tests/fail/should-pass/cpp20_rwc_syncs.stderr b/tests/fail/should-pass/cpp20_rwc_syncs.stderr
diff --git a/tests/pass/0weak_memory_consistency.rs b/tests/pass/0weak_memory_consistency.rs
@@ -268,9 +268,6 @@ fn test_iriw_sc_rlx() {
     let x = static_atomic_bool(false);
     let y = static_atomic_bool(false);
 
-    x.store(false, Relaxed);
-    y.store(false, Relaxed);
-
     let a = spawn(move || x.store(true, Relaxed));
     let b = spawn(move || y.store(true, Relaxed));
     let c = spawn(move || {
@@ -290,6 +287,40 @@ fn test_iriw_sc_rlx() {
     assert!(c || d);
 }
 
+// Similar to `test_iriw_sc_rlx` but with fences instead of SC accesses.
+fn test_cpp20_sc_fence_fix() {
+    let x = static_atomic_bool(false);
+    let y = static_atomic_bool(false);
+
+    let thread1 = spawn(|| {
+        let a = x.load(Relaxed);
+        fence(SeqCst);
+        let b = y.load(Relaxed);
+        (a, b)
+    });
+
+    let thread2 = spawn(|| {
+        x.store(true, Relaxed);
+    });
+    let thread3 = spawn(|| {
+        y.store(true, Relaxed);
+    });
+
+    let thread4 = spawn(|| {
+        let c = y.load(Relaxed);
+        fence(SeqCst);
+        let d = x.load(Relaxed);
+        (c, d)
+    });
+
+    let (a, b) = thread1.join().unwrap();
+    thread2.join().unwrap();
+    thread3.join().unwrap();
+    let (c, d) = thread4.join().unwrap();
+    let bad = a == true && b == false && c == true && d == false;
+    assert!(!bad);
+}
+
 pub fn main() {
     for _ in 0..50 {
         test_single_thread();
@@ -301,5 +332,6 @@ pub fn main() {
         test_sc_store_buffering();
         test_sync_through_rmw_and_fences();
         test_iriw_sc_rlx();
+        test_cpp20_sc_fence_fix();
     }
 }