diff --git a/README.md b/README.md
index 9a683ae68f..4e30dea18f 100644
--- a/README.md
+++ b/README.md
@@ -68,9 +68,10 @@ Further caveats that Miri users should be aware of:
   not support networking. System API support varies between targets; if you run
   on Windows it is a good idea to use `--target x86_64-unknown-linux-gnu` to get
   better support.
-* Weak memory emulation may [produce weak behaviors](https://github.com/rust-lang/miri/issues/2301)
-  when `SeqCst` fences are used that are not actually permitted by the Rust memory model, and it
-  cannot produce all behaviors possibly observable on real hardware.
+* Weak memory emulation is not complete: there are legal behaviors that Miri will never produce.
+  However, Miri produces many behaviors that are hard to observe on real hardware, so it can help
+  quite a bit in finding weak memory concurrency bugs. To be really sure about complicated atomic
+  code, use specialized tools such as [loom](https://github.com/tokio-rs/loom).
 
 Moreover, Miri fundamentally cannot ensure that your code is *sound*. [Soundness] is the property
 of never causing undefined behavior when invoked from arbitrary safe code, even in combination with
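A note on the loom recommendation above: loom exhaustively explores the executions its (C11-subset) model allows, which complements Miri's randomized exploration. As an illustration (not part of this diff; based on loom's documented `model`/`thread`/`sync` API), checking the message-passing idiom with loom could look like this:

```rust
// Illustrative loom check of the message-passing idiom.
// Run with: RUSTFLAGS="--cfg loom" cargo test --release
use std::sync::atomic::Ordering::{Acquire, Relaxed, Release};

#[test]
fn message_passing() {
    loom::model(|| {
        use loom::sync::Arc;
        use loom::sync::atomic::AtomicUsize;
        use loom::thread;

        let data = Arc::new(AtomicUsize::new(0));
        let flag = Arc::new(AtomicUsize::new(0));
        let (d, f) = (data.clone(), flag.clone());

        let t = thread::spawn(move || {
            d.store(1, Relaxed);
            f.store(1, Release); // publish the data write
        });

        if flag.load(Acquire) == 1 {
            // The Acquire load synchronized with the Release store,
            // so the Relaxed data write must be visible.
            assert_eq!(data.load(Relaxed), 1);
        }
        t.join().unwrap();
    });
}
```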
diff --git a/src/concurrency/data_race.rs b/src/concurrency/data_race.rs
index f86d1eb1dc..5cb68634b7 100644
--- a/src/concurrency/data_race.rs
+++ b/src/concurrency/data_race.rs
@@ -108,19 +108,19 @@ pub(super) struct ThreadClockSet {
     fence_acquire: VClock,
 
     /// The last timestamp of happens-before relations that
-    /// have been released by this thread by a fence.
+    /// have been released by this thread by a release fence.
     fence_release: VClock,
 
-    /// Timestamps of the last SC fence performed by each
-    /// thread, updated when this thread performs an SC fence
-    pub(super) fence_seqcst: VClock,
-
     /// Timestamps of the last SC write performed by each
-    /// thread, updated when this thread performs an SC fence
+    /// thread, updated when this thread performs an SC fence.
+    /// This is never acquired into the thread's clock, it
+    /// just limits which old writes can be seen in weak memory emulation.
    pub(super) write_seqcst: VClock,
 
     /// Timestamps of the last SC fence performed by each
-    /// thread, updated when this thread performs an SC read
+    /// thread, updated when this thread performs an SC read.
+    /// This is never acquired into the thread's clock, it
+    /// just limits which old writes can be seen in weak memory emulation.
     pub(super) read_seqcst: VClock,
 }
 
@@ -256,6 +256,106 @@ enum AccessType {
     AtomicRmw,
 }
 
+/// Per-byte vector clock metadata for data-race detection.
+#[derive(Clone, PartialEq, Eq, Debug)]
+struct MemoryCellClocks {
+    /// The vector-clock timestamp and the thread that did the last non-atomic write. We don't need
+    /// a full `VClock` here, it's always a single thread and nothing synchronizes, so the effective
+    /// clock is all-0 except for the thread that did the write.
+    write: (VectorIdx, VTimestamp),
+
+    /// The type of operation that the write index represents,
+    /// either newly allocated memory, a non-atomic write or
+    /// a deallocation of memory.
+    write_type: NaWriteType,
+
+    /// The vector-clock of all non-atomic reads that happened since the last non-atomic write
+    /// (i.e., we join together the "singleton" clocks corresponding to each read). It is reset to
+    /// zero on each write operation.
+    read: VClock,
+
+    /// Atomic access, acquire, release sequence tracking clocks.
+    /// For non-atomic memory this value is set to None.
+    /// For atomic memory, each byte carries this information.
+    atomic_ops: Option<Box<AtomicMemoryCellClocks>>,
+}
+
+/// Extra metadata associated with a thread.
+#[derive(Debug, Clone, Default)]
+struct ThreadExtraState {
+    /// The current vector index in use by the
+    /// thread currently, this is set to None
+    /// after the vector index has been re-used
+    /// and hence the value will never need to be
+    /// read during data-race reporting.
+    vector_index: Option<VectorIdx>,
+
+    /// Thread termination vector clock, this
+    /// is set on thread termination and is used
+    /// for joining on threads since the vector_index
+    /// may be re-used when the join operation occurs.
+    termination_vector_clock: Option<VClock>,
+}
+
+/// Global data-race detection state, contains the currently
+/// executing thread as well as the vector-clocks associated
+/// with each of the threads.
+// FIXME: it is probably better to have one large RefCell, than to have so many small ones.
+#[derive(Debug, Clone)]
+pub struct GlobalState {
+    /// Set to true once the first additional
+    /// thread has launched, due to the dependency
+    /// between before and after a thread launch.
+    /// Any data-races must be recorded after this
+    /// so concurrent execution can ignore recording
+    /// any data-races.
+    multi_threaded: Cell<bool>,
+
+    /// A flag to mark we are currently performing
+    /// a data race free action (such as atomic access)
+    /// to suppress the race detector
+    ongoing_action_data_race_free: Cell<bool>,
+
+    /// Mapping of a vector index to a known set of thread
+    /// clocks, this is not directly mapping from a thread id
+    /// since it may refer to multiple threads.
+    vector_clocks: RefCell<IndexVec<VectorIdx, ThreadClockSet>>,
+
+    /// Mapping of a given vector index to the current thread
+    /// that the execution is representing, this may change
+    /// if a vector index is re-assigned to a new thread.
+    vector_info: RefCell<IndexVec<VectorIdx, ThreadId>>,
+
+    /// The mapping of a given thread to associated thread metadata.
+    thread_info: RefCell<IndexVec<ThreadId, ThreadExtraState>>,
+
+    /// Potential vector indices that could be re-used on thread creation,
+    /// values are inserted here after the thread has terminated and
+    /// been joined with, and hence may potentially become free
+    /// for use as the index for a new thread.
+    /// Elements in this set may still require the vector index to
+    /// report data-races, and can only be re-used after all
+    /// active vector-clocks catch up with the thread's timestamp.
+    reuse_candidates: RefCell<FxHashSet<VectorIdx>>,
+
+    /// We make SC fences act like RMWs on a global location.
+    /// To implement that, they all release and acquire into this clock.
+    last_sc_fence: RefCell<VClock>,
+
+    /// The timestamp of last SC write performed by each thread.
+    /// Threads only update their own index here!
+    last_sc_write_per_thread: RefCell<VClock>,
+
+    /// Track when an outdated (weak memory) load happens.
+    pub track_outdated_loads: bool,
+}
+
+impl VisitProvenance for GlobalState {
+    fn visit_provenance(&self, _visit: &mut VisitWith<'_>) {
+        // We don't have any tags.
+    }
+}
+
 impl AccessType {
     fn description(self, ty: Option<Ty<'_>>, size: Option<Size>) -> String {
         let mut msg = String::new();
@@ -309,30 +409,6 @@ impl AccessType {
     }
 }
 
-/// Per-byte vector clock metadata for data-race detection.
-#[derive(Clone, PartialEq, Eq, Debug)]
-struct MemoryCellClocks {
-    /// The vector-clock timestamp and the thread that did the last non-atomic write. We don't need
-    /// a full `VClock` here, it's always a single thread and nothing synchronizes, so the effective
-    /// clock is all-0 except for the thread that did the write.
-    write: (VectorIdx, VTimestamp),
-
-    /// The type of operation that the write index represents,
-    /// either newly allocated memory, a non-atomic write or
-    /// a deallocation of memory.
-    write_type: NaWriteType,
-
-    /// The vector-clock of all non-atomic reads that happened since the last non-atomic write
-    /// (i.e., we join together the "singleton" clocks corresponding to each read). It is reset to
-    /// zero on each write operation.
-    read: VClock,
-
-    /// Atomic access, acquire, release sequence tracking clocks.
-    /// For non-atomic memory this value is set to None.
-    /// For atomic memory, each byte carries this information.
-    atomic_ops: Option<Box<AtomicMemoryCellClocks>>,
-}
-
 impl AtomicMemoryCellClocks {
     fn new(size: Size) -> Self {
         AtomicMemoryCellClocks {
@@ -803,9 +879,17 @@ pub trait EvalContextExt<'tcx>: MiriInterpCxExt<'tcx> {
                     clocks.apply_release_fence();
                 }
                 if atomic == AtomicFenceOrd::SeqCst {
-                    data_race.last_sc_fence.borrow_mut().set_at_index(&clocks.clock, index);
-                    clocks.fence_seqcst.join(&data_race.last_sc_fence.borrow());
-                    clocks.write_seqcst.join(&data_race.last_sc_write.borrow());
+                    // Behave like an RMW on the global fence location. This takes full care of
+                    // all the SC fence requirements, including C++17 §32.4 [atomics.order]
+                    // paragraph 6 (which would limit what future reads can see). It also rules
+                    // out many legal behaviors, but we don't currently have a model that would
+                    // be more precise.
+                    let mut sc_fence_clock = data_race.last_sc_fence.borrow_mut();
+                    sc_fence_clock.join(&clocks.clock);
+                    clocks.clock.join(&sc_fence_clock);
+                    // Also establish some sort of order with the last SC write that happened, globally
+                    // (but this is only respected by future reads).
+                    clocks.write_seqcst.join(&data_race.last_sc_write_per_thread.borrow());
                 }
 
                 // Increment timestamp in case of release semantics.
@@ -1463,80 +1547,6 @@ trait EvalContextPrivExt<'tcx>: MiriInterpCxExt<'tcx> {
     }
 }
 
-/// Extra metadata associated with a thread.
-#[derive(Debug, Clone, Default)]
-struct ThreadExtraState {
-    /// The current vector index in use by the
-    /// thread currently, this is set to None
-    /// after the vector index has been re-used
-    /// and hence the value will never need to be
-    /// read during data-race reporting.
-    vector_index: Option<VectorIdx>,
-
-    /// Thread termination vector clock, this
-    /// is set on thread termination and is used
-    /// for joining on threads since the vector_index
-    /// may be re-used when the join operation occurs.
-    termination_vector_clock: Option<VClock>,
-}
-
-/// Global data-race detection state, contains the currently
-/// executing thread as well as the vector-clocks associated
-/// with each of the threads.
-// FIXME: it is probably better to have one large RefCell, than to have so many small ones.
-#[derive(Debug, Clone)]
-pub struct GlobalState {
-    /// Set to true once the first additional
-    /// thread has launched, due to the dependency
-    /// between before and after a thread launch.
-    /// Any data-races must be recorded after this
-    /// so concurrent execution can ignore recording
-    /// any data-races.
-    multi_threaded: Cell<bool>,
-
-    /// A flag to mark we are currently performing
-    /// a data race free action (such as atomic access)
-    /// to suppress the race detector
-    ongoing_action_data_race_free: Cell<bool>,
-
-    /// Mapping of a vector index to a known set of thread
-    /// clocks, this is not directly mapping from a thread id
-    /// since it may refer to multiple threads.
-    vector_clocks: RefCell<IndexVec<VectorIdx, ThreadClockSet>>,
-
-    /// Mapping of a given vector index to the current thread
-    /// that the execution is representing, this may change
-    /// if a vector index is re-assigned to a new thread.
-    vector_info: RefCell<IndexVec<VectorIdx, ThreadId>>,
-
-    /// The mapping of a given thread to associated thread metadata.
-    thread_info: RefCell<IndexVec<ThreadId, ThreadExtraState>>,
-
-    /// Potential vector indices that could be re-used on thread creation,
-    /// values are inserted here after the thread has terminated and
-    /// been joined with, and hence may potentially become free
-    /// for use as the index for a new thread.
-    /// Elements in this set may still require the vector index to
-    /// report data-races, and can only be re-used after all
-    /// active vector-clocks catch up with the thread's timestamp.
-    reuse_candidates: RefCell<FxHashSet<VectorIdx>>,
-
-    /// The timestamp of last SC fence performed by each thread
-    last_sc_fence: RefCell<VClock>,
-
-    /// The timestamp of last SC write performed by each thread
-    last_sc_write: RefCell<VClock>,
-
-    /// Track when an outdated (weak memory) load happens.
-    pub track_outdated_loads: bool,
-}
-
-impl VisitProvenance for GlobalState {
-    fn visit_provenance(&self, _visit: &mut VisitWith<'_>) {
-        // We don't have any tags.
-    }
-}
-
 impl GlobalState {
     /// Create a new global state, setup with just thread-id=0
     /// advanced to timestamp = 1.
@@ -1549,7 +1559,7 @@ impl GlobalState {
             thread_info: RefCell::new(IndexVec::new()),
             reuse_candidates: RefCell::new(FxHashSet::default()),
             last_sc_fence: RefCell::new(VClock::default()),
-            last_sc_write: RefCell::new(VClock::default()),
+            last_sc_write_per_thread: RefCell::new(VClock::default()),
             track_outdated_loads: config.track_outdated_loads,
         };
 
@@ -1851,7 +1861,7 @@ impl GlobalState {
     // SC ATOMIC STORE rule in the paper.
     pub(super) fn sc_write(&self, thread_mgr: &ThreadManager<'_>) {
         let (index, clocks) = self.active_thread_state(thread_mgr);
-        self.last_sc_write.borrow_mut().set_at_index(&clocks.clock, index);
+        self.last_sc_write_per_thread.borrow_mut().set_at_index(&clocks.clock, index);
     }
 
     // SC ATOMIC READ rule in the paper.
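The fence hunk above is the core of this change: an SC fence now joins the thread's clock into the global `last_sc_fence` clock and then joins that clock back into the thread's own clock, exactly as an acquire-release RMW on a hypothetical global location would. A minimal sketch of why this orders any two SC fences (hypothetical simplified types, not Miri's actual `VClock`):

```rust
// Toy vector-clock model of the SC-fence-as-RMW rule above.
// `join` takes the pointwise maximum, like Miri's `VClock::join`.
#[derive(Clone, Default, Debug, PartialEq)]
struct Clock(Vec<u64>);

impl Clock {
    fn join(&mut self, other: &Clock) {
        if self.0.len() < other.0.len() {
            self.0.resize(other.0.len(), 0);
        }
        for (i, &t) in other.0.iter().enumerate() {
            self.0[i] = self.0[i].max(t);
        }
    }
}

fn sc_fence(thread_clock: &mut Clock, global_fence_clock: &mut Clock) {
    // "Release" into the global clock, then "acquire" it back:
    // precisely an acquire-release RMW on one global location.
    global_fence_clock.join(thread_clock);
    thread_clock.join(global_fence_clock);
}

fn main() {
    let mut global = Clock::default();
    let mut t1 = Clock(vec![5, 0]);
    let mut t2 = Clock(vec![0, 3]);

    sc_fence(&mut t1, &mut global);
    // Whichever fence executes second sees everything the first released:
    sc_fence(&mut t2, &mut global);
    assert_eq!(t2, Clock(vec![5, 3])); // t2 now knows about t1's history
}
```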
diff --git a/src/concurrency/weak_memory.rs b/src/concurrency/weak_memory.rs
index c610f1999f..c88d031283 100644
--- a/src/concurrency/weak_memory.rs
+++ b/src/concurrency/weak_memory.rs
@@ -11,10 +11,17 @@
 //! This implementation is not fully correct under the revised C++20 model and may generate behaviours C++20
 //! disallows (<https://github.com/rust-lang/miri/issues/2301>).
 //!
-//! A modification is made to the paper's model to partially address C++20 changes.
-//! Specifically, if an SC load reads from an atomic store of any ordering, then a later SC load cannot read from
-//! an earlier store in the location's modification order. This is to prevent creating a backwards S edge from the second
-//! load to the first, as a result of C++20's coherence-ordered before rules.
+//! Modifications are made to the paper's model to address C++20 changes:
+//!  - If an SC load reads from an atomic store of any ordering, then a later SC load cannot read
+//!    from an earlier store in the location's modification order. This is to prevent creating a
+//!    backwards S edge from the second load to the first, as a result of C++20's coherence-ordered
+//!    before rules. (This seems to rule out behaviors that were actually permitted by the RC11 model
+//!    that C++20 intended to copy (<https://plv.mpi-sws.org/scfix/paper.pdf>); a change was
+//!    introduced when translating the math to English. According to Viktor Vafeiadis, this
+//!    difference is harmless. So we stick to what the standard says, and allow fewer behaviors.)
+//!  - SC fences are treated like RMWs to a global clock, to ensure they induce enough
+//!    synchronization with the surrounding accesses. This rules out legal behavior, but it is really
+//!    hard to be more precise here.
 //!
 //! Rust follows the C++20 memory model (except for the Consume ordering and some operations not performable through C++'s
 //! `std::atomic` API). It is therefore possible for this implementation to generate behaviours never observable when the
@@ -138,6 +145,7 @@ struct StoreElement {
     /// The timestamp of the storing thread when it performed the store
     timestamp: VTimestamp,
 
+    /// The value of this store. `None` means uninitialized.
     // FIXME: Currently, we cannot represent partial initialization.
     val: Option<Scalar>,
 
@@ -335,11 +343,6 @@ impl<'tcx> StoreBuffer {
             // then we cannot read-from anything earlier in modification order.
             // C++20 §6.9.2.2 [intro.races] paragraph 16
             false
-        } else if store_elem.timestamp <= clocks.fence_seqcst[store_elem.store_index] {
-            // The current load, which may be sequenced-after an SC fence, cannot read-before
-            // the last store sequenced-before an SC fence in another thread.
-            // C++17 §32.4 [atomics.order] paragraph 6
-            false
         } else if store_elem.timestamp <= clocks.write_seqcst[store_elem.store_index]
             && store_elem.is_seqcst
         {
@@ -356,9 +359,9 @@ impl<'tcx> StoreBuffer {
             false
         } else if is_seqcst && store_elem.load_info.borrow().sc_loaded {
             // The current SC load cannot read-before a store that an earlier SC load has observed.
-            // See https://github.com/rust-lang/miri/issues/2301#issuecomment-1222720427
+            // See https://github.com/rust-lang/miri/issues/2301#issuecomment-1222720427.
             // Consequences of C++20 §31.4 [atomics.order] paragraph 3.1, 3.3 (coherence-ordered before)
-            // and 4.1 (coherence-ordered before between SC makes global total order S)
+            // and 4.1 (coherence-ordered before between SC makes global total order S).
             false
         } else {
             true
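For intuition on the first bullet in the module comment above: in its simplest form, restricted to a single reading thread, the rule coincides with ordinary read-read coherence (illustrative only, not part of the diff):

```rust
use std::sync::atomic::{AtomicI32, Ordering::{Relaxed, SeqCst}};
use std::thread;

// Once an SC load has observed a store, later SC loads may not read anything
// earlier in that location's modification order. With both loads on one
// thread, this is exactly read-read coherence.
fn main() {
    static X: AtomicI32 = AtomicI32::new(0);
    let writer = thread::spawn(|| {
        X.store(1, Relaxed);
        X.store(2, Relaxed); // modification order of X: 0, 1, 2
    });
    let a = X.load(SeqCst);
    let b = X.load(SeqCst);
    writer.join().unwrap();
    // Forbidden: a == 2 && b == 1 (b would read "before" what a observed).
    assert!(!(a == 2 && b == 1));
}
```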
diff --git a/tests/fail/should-pass/cpp20_rwc_syncs.rs b/tests/fail/should-pass/cpp20_rwc_syncs.rs
deleted file mode 100644
index cebad507ea..0000000000
--- a/tests/fail/should-pass/cpp20_rwc_syncs.rs
+++ /dev/null
@@ -1,87 +0,0 @@
-//@compile-flags: -Zmiri-ignore-leaks
-
-// https://plv.mpi-sws.org/scfix/paper.pdf
-// 2.2 Second Problem: SC Fences are Too Weak
-// This test should pass under the C++20 model Rust is using.
-// Unfortunately, Miri's weak memory emulation only follows the C++11 model
-// as we don't know how to correctly emulate C++20's revised SC semantics,
-// so we have to stick to C++11 emulation from existing research.
-
-use std::sync::atomic::Ordering::*;
-use std::sync::atomic::{AtomicUsize, fence};
-use std::thread::spawn;
-
-// Spins until it reads the given value
-fn reads_value(loc: &AtomicUsize, val: usize) -> usize {
-    while loc.load(Relaxed) != val {
-        std::hint::spin_loop();
-    }
-    val
-}
-
-// We can't create static items because we need to run each test
-// multiple times.
-fn static_atomic(val: usize) -> &'static AtomicUsize {
-    let ret = Box::leak(Box::new(AtomicUsize::new(val)));
-    // A workaround to put the initialization value in the store buffer.
-    // See https://github.com/rust-lang/miri/issues/2164
-    ret.load(Relaxed);
-    ret
-}
-
-fn test_cpp20_rwc_syncs() {
-    /*
-    int main() {
-        atomic_int x = 0;
-        atomic_int y = 0;
-
-        {{{ x.store(1,mo_relaxed);
-        ||| { r1=x.load(mo_relaxed).readsvalue(1);
-              fence(mo_seq_cst);
-              r2=y.load(mo_relaxed); }
-        ||| { y.store(1,mo_relaxed);
-              fence(mo_seq_cst);
-              r3=x.load(mo_relaxed); }
-        }}}
-        return 0;
-    }
-    */
-    let x = static_atomic(0);
-    let y = static_atomic(0);
-
-    let j1 = spawn(move || {
-        x.store(1, Relaxed);
-    });
-
-    let j2 = spawn(move || {
-        reads_value(&x, 1);
-        fence(SeqCst);
-        y.load(Relaxed)
-    });
-
-    let j3 = spawn(move || {
-        y.store(1, Relaxed);
-        fence(SeqCst);
-        x.load(Relaxed)
-    });
-
-    j1.join().unwrap();
-    let b = j2.join().unwrap();
-    let c = j3.join().unwrap();
-
-    // We cannot write assert_ne!() since ui_test's fail
-    // tests expect exit status 1, whereas panics produce 101.
-    // Our ui_test does not yet support overriding failure status codes.
-    if (b, c) == (0, 0) {
-        // This *should* be unreachable, but Miri will reach it.
-        unsafe {
-            std::hint::unreachable_unchecked(); //~ERROR: unreachable
-        }
-    }
-}
-
-pub fn main() {
-    for _ in 0..500 {
-        test_cpp20_rwc_syncs();
-    }
-}
diff --git a/tests/fail/should-pass/cpp20_rwc_syncs.stderr b/tests/fail/should-pass/cpp20_rwc_syncs.stderr
deleted file mode 100644
index 5185845568..0000000000
--- a/tests/fail/should-pass/cpp20_rwc_syncs.stderr
+++ /dev/null
@@ -1,20 +0,0 @@
-error: Undefined Behavior: entering unreachable code
-  --> tests/fail/should-pass/cpp20_rwc_syncs.rs:LL:CC
-   |
-LL |             std::hint::unreachable_unchecked();
-   |             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ entering unreachable code
-   |
-   = help: this indicates a bug in the program: it performed an invalid operation, and caused Undefined Behavior
-   = help: see https://doc.rust-lang.org/nightly/reference/behavior-considered-undefined.html for further information
-   = note: BACKTRACE:
-   = note: inside `test_cpp20_rwc_syncs` at tests/fail/should-pass/cpp20_rwc_syncs.rs:LL:CC
-note: inside `main`
-  --> tests/fail/should-pass/cpp20_rwc_syncs.rs:LL:CC
-   |
-LL |         test_cpp20_rwc_syncs();
-   |         ^^^^^^^^^^^^^^^^^^^^^^
-
-note: some details are omitted, run with `MIRIFLAGS=-Zmiri-backtrace=full` for a verbose backtrace
-
-error: aborting due to 1 previous error
-
diff --git a/tests/pass/0weak_memory_consistency.rs b/tests/pass/0weak_memory_consistency.rs
index 10f7aed941..840dd56939 100644
--- a/tests/pass/0weak_memory_consistency.rs
+++ b/tests/pass/0weak_memory_consistency.rs
@@ -22,7 +22,7 @@
 // Available: https://ss265.host.cs.st-andrews.ac.uk/papers/n3132.pdf.
 
 use std::sync::atomic::Ordering::*;
-use std::sync::atomic::{AtomicBool, AtomicI32, fence};
+use std::sync::atomic::{AtomicBool, AtomicI32, Ordering, fence};
 use std::thread::spawn;
 
 #[derive(Copy, Clone)]
@@ -45,8 +45,8 @@ fn static_atomic_bool(val: bool) -> &'static AtomicBool {
 }
 
 // Spins until it acquires a pre-determined value.
-fn acquires_value(loc: &AtomicI32, val: i32) -> i32 {
-    while loc.load(Acquire) != val {
+fn loads_value(loc: &AtomicI32, ord: Ordering, val: i32) -> i32 {
+    while loc.load(ord) != val {
         std::hint::spin_loop();
     }
     val
@@ -69,7 +69,7 @@ fn test_corr() {
     });                              // |                      |
     #[rustfmt::skip]                 // |synchronizes-with     |happens-before
     let j3 = spawn(move || {         // |                      |
-        acquires_value(&y, 1);       // <------------------+   |
+        loads_value(&y, Acquire, 1); // <------------+         |
         x.load(Relaxed)              // <----------------------------------------------+
         // The two reads on x are ordered by hb, so they cannot observe values
         // differently from the modification order. If the first read observed
@@ -94,12 +94,12 @@ fn test_wrc() {
     });                              // |                      |
     #[rustfmt::skip]                 // |synchronizes-with     |
    let j2 = spawn(move || {          // |                      |
-        acquires_value(&x, 1);       // <------------------+   |
+        loads_value(&x, Acquire, 1); // <------------+         |
         y.store(1, Release);         // ---------------------+ |happens-before
     });                              // |                      |
     #[rustfmt::skip]                 // |synchronizes-with     |
     let j3 = spawn(move || {         // |                      |
-        acquires_value(&y, 1);       // <------------------+   |
+        loads_value(&y, Acquire, 1); // <------------+         |
         x.load(Relaxed)              // <-----------------------------------------------+
     });
 
@@ -125,7 +125,7 @@ fn test_message_passing() {
     #[rustfmt::skip]                 // |synchronizes-with     | happens-before
     let j2 = spawn(move || {         // |                      |
         let x = x; // avoid field capturing                    |
-        acquires_value(&y, 1);       // <------------------+   |
+        loads_value(&y, Acquire, 1); // <------------+         |
         unsafe { *x.0 }              // <---------------------------------------------+
     });
 
@@ -268,9 +268,6 @@ fn test_iriw_sc_rlx() {
     let x = static_atomic_bool(false);
     let y = static_atomic_bool(false);
 
-    x.store(false, Relaxed);
-    y.store(false, Relaxed);
-
     let a = spawn(move || x.store(true, Relaxed));
     let b = spawn(move || y.store(true, Relaxed));
     let c = spawn(move || {
@@ -290,6 +287,84 @@ fn test_iriw_sc_rlx() {
     assert!(c || d);
 }
 
+// Similar to `test_iriw_sc_rlx` but with fences instead of SC accesses.
+fn test_cpp20_sc_fence_fix() {
+    let x = static_atomic_bool(false);
+    let y = static_atomic_bool(false);
+
+    let thread1 = spawn(move || {
+        let a = x.load(Relaxed);
+        fence(SeqCst);
+        let b = y.load(Relaxed);
+        (a, b)
+    });
+
+    let thread2 = spawn(move || {
+        x.store(true, Relaxed);
+    });
+    let thread3 = spawn(move || {
+        y.store(true, Relaxed);
+    });
+
+    let thread4 = spawn(move || {
+        let c = y.load(Relaxed);
+        fence(SeqCst);
+        let d = x.load(Relaxed);
+        (c, d)
+    });
+
+    let (a, b) = thread1.join().unwrap();
+    thread2.join().unwrap();
+    thread3.join().unwrap();
+    let (c, d) = thread4.join().unwrap();
+    let bad = a == true && b == false && c == true && d == false;
+    assert!(!bad);
+}
+
+// https://plv.mpi-sws.org/scfix/paper.pdf
+// 2.2 Second Problem: SC Fences are Too Weak
+fn test_cpp20_rwc_syncs() {
+    /*
+    int main() {
+        atomic_int x = 0;
+        atomic_int y = 0;
+
+        {{{ x.store(1,mo_relaxed);
+        ||| { r1=x.load(mo_relaxed).readsvalue(1);
+              fence(mo_seq_cst);
+              r2=y.load(mo_relaxed); }
+        ||| { y.store(1,mo_relaxed);
+              fence(mo_seq_cst);
+              r3=x.load(mo_relaxed); }
+        }}}
+        return 0;
+    }
+    */
+    let x = static_atomic(0);
+    let y = static_atomic(0);
+
+    let j1 = spawn(move || {
+        x.store(1, Relaxed);
+    });
+
+    let j2 = spawn(move || {
+        loads_value(&x, Relaxed, 1);
+        fence(SeqCst);
+        y.load(Relaxed)
+    });
+
+    let j3 = spawn(move || {
+        y.store(1, Relaxed);
+        fence(SeqCst);
+        x.load(Relaxed)
+    });
+
+    j1.join().unwrap();
+    let b = j2.join().unwrap();
+    let c = j3.join().unwrap();
+
+    assert!((b, c) != (0, 0));
+}
+
 pub fn main() {
     for _ in 0..50 {
         test_single_thread();
@@ -301,5 +376,7 @@ pub fn main() {
         test_sc_store_buffering();
         test_sync_through_rmw_and_fences();
         test_iriw_sc_rlx();
+        test_cpp20_sc_fence_fix();
+        test_cpp20_rwc_syncs();
     }
 }
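For contrast with `test_sc_store_buffering` and the new tests above: under `Relaxed`, the classic store-buffering outcome where both loads return 0 is perfectly legal, and it is exactly the kind of behavior that is rare in native runs but that Miri's store-buffer emulation can produce (illustrative sketch, not part of the diff):

```rust
use std::sync::atomic::{AtomicI32, Ordering::Relaxed};
use std::thread::spawn;

// Store buffering with Relaxed: both threads may read 0.
fn main() {
    // Leaking gives the &'static lifetime that `spawn` needs.
    let x: &'static AtomicI32 = Box::leak(Box::new(AtomicI32::new(0)));
    let y: &'static AtomicI32 = Box::leak(Box::new(AtomicI32::new(0)));

    let j1 = spawn(move || {
        x.store(1, Relaxed);
        y.load(Relaxed)
    });
    let j2 = spawn(move || {
        y.store(1, Relaxed);
        x.load(Relaxed)
    });

    let a = j1.join().unwrap();
    let b = j2.join().unwrap();
    // (0, 0), (0, 1), (1, 0) and (1, 1) are all allowed here; making both
    // stores and loads SeqCst is what rules out (0, 0).
    println!("a = {a}, b = {b}");
}
```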