diff --git a/README.md b/README.md
index 9a683ae68f..4e30dea18f 100644
--- a/README.md
+++ b/README.md
@@ -68,9 +68,10 @@ Further caveats that Miri users should be aware of:
   not support networking. System API support varies between targets; if you run
   on Windows it is a good idea to use `--target x86_64-unknown-linux-gnu` to get
   better support.
-* Weak memory emulation may [produce weak behaviors](https://github.com/rust-lang/miri/issues/2301)
-  when `SeqCst` fences are used that are not actually permitted by the Rust memory model, and it
-  cannot produce all behaviors possibly observable on real hardware.
+* Weak memory emulation is not complete: there are legal behaviors that Miri will never produce.
+  However, Miri produces many behaviors that are hard to observe on real hardware, so it can help
+  quite a bit in finding weak memory concurrency bugs. To be really sure about complicated atomic
+  code, use specialized tools such as [loom](https://github.com/tokio-rs/loom).
 
 Moreover, Miri fundamentally cannot ensure that your code is *sound*. [Soundness] is the property
 of never causing undefined behavior when invoked from arbitrary safe code, even in combination with
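A note on the loom recommendation above: loom exhaustively explores the executions its (C11-subset) model allows, which complements Miri's randomized exploration. As an illustration (not part of this diff; based on loom's documented `model`/`thread`/`sync` API), checking the message-passing idiom with loom could look like this:

```rust
// Illustrative loom check of the message-passing idiom.
// Run with: RUSTFLAGS="--cfg loom" cargo test --release
use std::sync::atomic::Ordering::{Acquire, Relaxed, Release};

#[test]
fn message_passing() {
    loom::model(|| {
        use loom::sync::Arc;
        use loom::sync::atomic::AtomicUsize;
        use loom::thread;

        let data = Arc::new(AtomicUsize::new(0));
        let flag = Arc::new(AtomicUsize::new(0));
        let (d, f) = (data.clone(), flag.clone());

        let t = thread::spawn(move || {
            d.store(1, Relaxed);
            f.store(1, Release); // publish the data write
        });

        if flag.load(Acquire) == 1 {
            // The Acquire load synchronized with the Release store,
            // so the Relaxed data write must be visible.
            assert_eq!(data.load(Relaxed), 1);
        }
        t.join().unwrap();
    });
}
```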
diff --git a/src/concurrency/data_race.rs b/src/concurrency/data_race.rs
index f86d1eb1dc..5cb68634b7 100644
--- a/src/concurrency/data_race.rs
+++ b/src/concurrency/data_race.rs
@@ -108,19 +108,19 @@ pub(super) struct ThreadClockSet {
     fence_acquire: VClock,
 
     /// The last timestamp of happens-before relations that
-    /// have been released by this thread by a fence.
+    /// have been released by this thread by a release fence.
     fence_release: VClock,
 
-    /// Timestamps of the last SC fence performed by each
-    /// thread, updated when this thread performs an SC fence
-    pub(super) fence_seqcst: VClock,
-
     /// Timestamps of the last SC write performed by each
-    /// thread, updated when this thread performs an SC fence
+    /// thread, updated when this thread performs an SC fence.
+    /// This is never acquired into the thread's clock, it
+    /// just limits which old writes can be seen in weak memory emulation.
    pub(super) write_seqcst: VClock,
 
     /// Timestamps of the last SC fence performed by each
-    /// thread, updated when this thread performs an SC read
+    /// thread, updated when this thread performs an SC read.
+    /// This is never acquired into the thread's clock, it
+    /// just limits which old writes can be seen in weak memory emulation.
     pub(super) read_seqcst: VClock,
 }
 
@@ -256,6 +256,106 @@ enum AccessType {
     AtomicRmw,
 }
 
+/// Per-byte vector clock metadata for data-race detection.
+#[derive(Clone, PartialEq, Eq, Debug)]
+struct MemoryCellClocks {
+    /// The vector-clock timestamp and the thread that did the last non-atomic write. We don't need
+    /// a full `VClock` here, it's always a single thread and nothing synchronizes, so the effective
+    /// clock is all-0 except for the thread that did the write.
+    write: (VectorIdx, VTimestamp),
+
+    /// The type of operation that the write index represents,
+    /// either newly allocated memory, a non-atomic write or
+    /// a deallocation of memory.
+    write_type: NaWriteType,
+
+    /// The vector-clock of all non-atomic reads that happened since the last non-atomic write
+    /// (i.e., we join together the "singleton" clocks corresponding to each read). It is reset to
+    /// zero on each write operation.
+    read: VClock,
+
+    /// Atomic access, acquire, release sequence tracking clocks.
+    /// For non-atomic memory this value is set to None.
+    /// For atomic memory, each byte carries this information.
+    atomic_ops: Option<Box<AtomicMemoryCellClocks>>,
+}
+
+/// Extra metadata associated with a thread.
+#[derive(Debug, Clone, Default)]
+struct ThreadExtraState {
+    /// The current vector index in use by the
+    /// thread currently, this is set to None
+    /// after the vector index has been re-used
+    /// and hence the value will never need to be
+    /// read during data-race reporting.
+    vector_index: Option<VectorIdx>,
+
+    /// Thread termination vector clock, this
+    /// is set on thread termination and is used
+    /// for joining on threads since the vector_index
+    /// may be re-used when the join operation occurs.
+    termination_vector_clock: Option<VClock>,
+}
+
+/// Global data-race detection state, contains the currently
+/// executing thread as well as the vector-clocks associated
+/// with each of the threads.
+// FIXME: it is probably better to have one large RefCell, than to have so many small ones.
+#[derive(Debug, Clone)]
+pub struct GlobalState {
+    /// Set to true once the first additional
+    /// thread has launched, due to the dependency
+    /// between before and after a thread launch.
+    /// Any data-races must be recorded after this
+    /// so concurrent execution can ignore recording
+    /// any data-races.
+    multi_threaded: Cell<bool>,
+
+    /// A flag to mark we are currently performing
+    /// a data race free action (such as atomic access)
+    /// to suppress the race detector
+    ongoing_action_data_race_free: Cell<bool>,
+
+    /// Mapping of a vector index to a known set of thread
+    /// clocks, this is not directly mapping from a thread id
+    /// since it may refer to multiple threads.
+    vector_clocks: RefCell<IndexVec<VectorIdx, ThreadClockSet>>,
+
+    /// Mapping of a given vector index to the current thread
+    /// that the execution is representing, this may change
+    /// if a vector index is re-assigned to a new thread.
+    vector_info: RefCell<IndexVec<VectorIdx, ThreadId>>,
+
+    /// The mapping of a given thread to associated thread metadata.
+    thread_info: RefCell<IndexVec<ThreadId, ThreadExtraState>>,
+
+    /// Potential vector indices that could be re-used on thread creation,
+    /// values are inserted here after the thread has terminated and
+    /// been joined with, and hence may potentially become free
+    /// for use as the index for a new thread.
+    /// Elements in this set may still require the vector index to
+    /// report data-races, and can only be re-used after all
+    /// active vector-clocks catch up with the thread's timestamp.
+    reuse_candidates: RefCell<FxHashSet<VectorIdx>>,
+
+    /// We make SC fences act like RMWs on a global location.
+    /// To implement that, they all release and acquire into this clock.
+    last_sc_fence: RefCell<VClock>,
+
+    /// The timestamp of last SC write performed by each thread.
+    /// Threads only update their own index here!
+    last_sc_write_per_thread: RefCell<VClock>,
+
+    /// Track when an outdated (weak memory) load happens.
+    pub track_outdated_loads: bool,
+}
+
+impl VisitProvenance for GlobalState {
+    fn visit_provenance(&self, _visit: &mut VisitWith<'_>) {
+        // We don't have any tags.
+    }
+}
+
 impl AccessType {
     fn description(self, ty: Option<Ty<'_>>, size: Option<Size>) -> String {
         let mut msg = String::new();
@@ -309,30 +409,6 @@ impl AccessType {
     }
 }
 
-/// Per-byte vector clock metadata for data-race detection.
-#[derive(Clone, PartialEq, Eq, Debug)]
-struct MemoryCellClocks {
-    /// The vector-clock timestamp and the thread that did the last non-atomic write. We don't need
-    /// a full `VClock` here, it's always a single thread and nothing synchronizes, so the effective
-    /// clock is all-0 except for the thread that did the write.
-    write: (VectorIdx, VTimestamp),
-
-    /// The type of operation that the write index represents,
-    /// either newly allocated memory, a non-atomic write or
-    /// a deallocation of memory.
-    write_type: NaWriteType,
-
-    /// The vector-clock of all non-atomic reads that happened since the last non-atomic write
-    /// (i.e., we join together the "singleton" clocks corresponding to each read). It is reset to
-    /// zero on each write operation.
-    read: VClock,
-
-    /// Atomic access, acquire, release sequence tracking clocks.
-    /// For non-atomic memory this value is set to None.
-    /// For atomic memory, each byte carries this information.
-    atomic_ops: Option<Box<AtomicMemoryCellClocks>>,
-}
-
 impl AtomicMemoryCellClocks {
     fn new(size: Size) -> Self {
         AtomicMemoryCellClocks {
@@ -803,9 +879,17 @@ pub trait EvalContextExt<'tcx>: MiriInterpCxExt<'tcx> {
                     clocks.apply_release_fence();
                 }
                 if atomic == AtomicFenceOrd::SeqCst {
-                    data_race.last_sc_fence.borrow_mut().set_at_index(&clocks.clock, index);
-                    clocks.fence_seqcst.join(&data_race.last_sc_fence.borrow());
-                    clocks.write_seqcst.join(&data_race.last_sc_write.borrow());
+                    // Behave like an RMW on the global fence location. This takes full care of
+                    // all the SC fence requirements, including C++17 §32.4 [atomics.order]
+                    // paragraph 6 (which would limit what future reads can see). It also rules
+                    // out many legal behaviors, but we don't currently have a model that would
+                    // be more precise.
+                    let mut sc_fence_clock = data_race.last_sc_fence.borrow_mut();
+                    sc_fence_clock.join(&clocks.clock);
+                    clocks.clock.join(&sc_fence_clock);
+                    // Also establish some sort of order with the last SC write that happened, globally
+                    // (but this is only respected by future reads).
+                    clocks.write_seqcst.join(&data_race.last_sc_write_per_thread.borrow());
                 }
 
                 // Increment timestamp in case of release semantics.
@@ -1463,80 +1547,6 @@ trait EvalContextPrivExt<'tcx>: MiriInterpCxExt<'tcx> {
     }
 }
 
-/// Extra metadata associated with a thread.
-#[derive(Debug, Clone, Default)]
-struct ThreadExtraState {
-    /// The current vector index in use by the
-    /// thread currently, this is set to None
-    /// after the vector index has been re-used
-    /// and hence the value will never need to be
-    /// read during data-race reporting.
-    vector_index: Option<VectorIdx>,
-
-    /// Thread termination vector clock, this
-    /// is set on thread termination and is used
-    /// for joining on threads since the vector_index
-    /// may be re-used when the join operation occurs.
-    termination_vector_clock: Option<VClock>,
-}
-
-/// Global data-race detection state, contains the currently
-/// executing thread as well as the vector-clocks associated
-/// with each of the threads.
-// FIXME: it is probably better to have one large RefCell, than to have so many small ones.
-#[derive(Debug, Clone)]
-pub struct GlobalState {
-    /// Set to true once the first additional
-    /// thread has launched, due to the dependency
-    /// between before and after a thread launch.
-    /// Any data-races must be recorded after this
-    /// so concurrent execution can ignore recording
-    /// any data-races.
-    multi_threaded: Cell<bool>,
-
-    /// A flag to mark we are currently performing
-    /// a data race free action (such as atomic access)
-    /// to suppress the race detector
-    ongoing_action_data_race_free: Cell<bool>,
-
-    /// Mapping of a vector index to a known set of thread
-    /// clocks, this is not directly mapping from a thread id
-    /// since it may refer to multiple threads.
-    vector_clocks: RefCell<IndexVec<VectorIdx, ThreadClockSet>>,
-
-    /// Mapping of a given vector index to the current thread
-    /// that the execution is representing, this may change
-    /// if a vector index is re-assigned to a new thread.
-    vector_info: RefCell<IndexVec<VectorIdx, ThreadId>>,
-
-    /// The mapping of a given thread to associated thread metadata.
-    thread_info: RefCell<IndexVec<ThreadId, ThreadExtraState>>,
-
-    /// Potential vector indices that could be re-used on thread creation,
-    /// values are inserted here after the thread has terminated and
-    /// been joined with, and hence may potentially become free
-    /// for use as the index for a new thread.
-    /// Elements in this set may still require the vector index to
-    /// report data-races, and can only be re-used after all
-    /// active vector-clocks catch up with the thread's timestamp.
-    reuse_candidates: RefCell<FxHashSet<VectorIdx>>,
-
-    /// The timestamp of last SC fence performed by each thread
-    last_sc_fence: RefCell<VClock>,
-
-    /// The timestamp of last SC write performed by each thread
-    last_sc_write: RefCell<VClock>,
-
-    /// Track when an outdated (weak memory) load happens.
-    pub track_outdated_loads: bool,
-}
-
-impl VisitProvenance for GlobalState {
-    fn visit_provenance(&self, _visit: &mut VisitWith<'_>) {
-        // We don't have any tags.
-    }
-}
-
 impl GlobalState {
     /// Create a new global state, setup with just thread-id=0
     /// advanced to timestamp = 1.
@@ -1549,7 +1559,7 @@ impl GlobalState {
             thread_info: RefCell::new(IndexVec::new()),
             reuse_candidates: RefCell::new(FxHashSet::default()),
             last_sc_fence: RefCell::new(VClock::default()),
-            last_sc_write: RefCell::new(VClock::default()),
+            last_sc_write_per_thread: RefCell::new(VClock::default()),
             track_outdated_loads: config.track_outdated_loads,
         };
 
@@ -1851,7 +1861,7 @@ impl GlobalState {
     // SC ATOMIC STORE rule in the paper.
     pub(super) fn sc_write(&self, thread_mgr: &ThreadManager<'_>) {
         let (index, clocks) = self.active_thread_state(thread_mgr);
-        self.last_sc_write.borrow_mut().set_at_index(&clocks.clock, index);
+        self.last_sc_write_per_thread.borrow_mut().set_at_index(&clocks.clock, index);
     }
 
     // SC ATOMIC READ rule in the paper.
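The fence hunk above is the core of this change: an SC fence now joins the thread's clock into the global `last_sc_fence` clock and then joins that clock back into the thread's own clock, exactly as an acquire-release RMW on a hypothetical global location would. A minimal sketch of why this orders any two SC fences (hypothetical simplified types, not Miri's actual `VClock`):

```rust
// Toy vector-clock model of the SC-fence-as-RMW rule above.
// `join` takes the pointwise maximum, like Miri's `VClock::join`.
#[derive(Clone, Default, Debug, PartialEq)]
struct Clock(Vec<u64>);

impl Clock {
    fn join(&mut self, other: &Clock) {
        if self.0.len() < other.0.len() {
            self.0.resize(other.0.len(), 0);
        }
        for (i, &t) in other.0.iter().enumerate() {
            self.0[i] = self.0[i].max(t);
        }
    }
}

fn sc_fence(thread_clock: &mut Clock, global_fence_clock: &mut Clock) {
    // "Release" into the global clock, then "acquire" it back:
    // precisely an acquire-release RMW on one global location.
    global_fence_clock.join(thread_clock);
    thread_clock.join(global_fence_clock);
}

fn main() {
    let mut global = Clock::default();
    let mut t1 = Clock(vec![5, 0]);
    let mut t2 = Clock(vec![0, 3]);

    sc_fence(&mut t1, &mut global);
    // Whichever fence executes second sees everything the first released:
    sc_fence(&mut t2, &mut global);
    assert_eq!(t2, Clock(vec![5, 3])); // t2 now knows about t1's history
}
```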
diff --git a/src/concurrency/weak_memory.rs b/src/concurrency/weak_memory.rs
index c610f1999f..c88d031283 100644
--- a/src/concurrency/weak_memory.rs
+++ b/src/concurrency/weak_memory.rs
@@ -11,10 +11,17 @@
 //! This implementation is not fully correct under the revised C++20 model and may generate behaviours C++20
 //! disallows (<https://github.com/rust-lang/miri/issues/2301>).
 //!
-//! A modification is made to the paper's model to partially address C++20 changes.
-//! Specifically, if an SC load reads from an atomic store of any ordering, then a later SC load cannot read from
-//! an earlier store in the location's modification order. This is to prevent creating a backwards S edge from the second
-//! load to the first, as a result of C++20's coherence-ordered before rules.
+//! Modifications are made to the paper's model to address C++20 changes:
+//!  - If an SC load reads from an atomic store of any ordering, then a later SC load cannot read
+//!    from an earlier store in the location's modification order. This is to prevent creating a
+//!    backwards S edge from the second load to the first, as a result of C++20's coherence-ordered
+//!    before rules. (This seems to rule out behaviors that were actually permitted by the RC11 model
+//!    that C++20 intended to copy (<https://plv.mpi-sws.org/scfix/paper.pdf>); a change was
+//!    introduced when translating the math to English. According to Viktor Vafeiadis, this
+//!    difference is harmless. So we stick to what the standard says, and allow fewer behaviors.)
+//!  - SC fences are treated like RMWs to a global clock, to ensure they induce enough
+//!    synchronization with the surrounding accesses. This rules out legal behavior, but it is really
+//!    hard to be more precise here.
 //!
 //! Rust follows the C++20 memory model (except for the Consume ordering and some operations not performable through C++'s
 //! `std::atomic` API). It is therefore possible for this implementation to generate behaviours never observable when the
@@ -138,6 +145,7 @@ struct StoreElement {
     /// The timestamp of the storing thread when it performed the store
     timestamp: VTimestamp,
 
+    /// The value of this store. `None` means uninitialized.
     // FIXME: Currently, we cannot represent partial initialization.
     val: Option<Scalar>,
 
@@ -335,11 +343,6 @@ impl<'tcx> StoreBuffer {
             // then we cannot read-from anything earlier in modification order.
             // C++20 §6.9.2.2 [intro.races] paragraph 16
             false
-        } else if store_elem.timestamp <= clocks.fence_seqcst[store_elem.store_index] {
-            // The current load, which may be sequenced-after an SC fence, cannot read-before
-            // the last store sequenced-before an SC fence in another thread.
-            // C++17 §32.4 [atomics.order] paragraph 6
-            false
         } else if store_elem.timestamp <= clocks.write_seqcst[store_elem.store_index]
             && store_elem.is_seqcst
         {
@@ -356,9 +359,9 @@ impl<'tcx> StoreBuffer {
             false
         } else if is_seqcst && store_elem.load_info.borrow().sc_loaded {
             // The current SC load cannot read-before a store that an earlier SC load has observed.
-            // See https://github.com/rust-lang/miri/issues/2301#issuecomment-1222720427
+            // See https://github.com/rust-lang/miri/issues/2301#issuecomment-1222720427.
             // Consequences of C++20 §31.4 [atomics.order] paragraph 3.1, 3.3 (coherence-ordered before)
-            // and 4.1 (coherence-ordered before between SC makes global total order S)
+            // and 4.1 (coherence-ordered before between SC makes global total order S).
             false
         } else {
             true
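For intuition on the first bullet in the module comment above: in its simplest form, restricted to a single reading thread, the rule coincides with ordinary read-read coherence (illustrative only, not part of the diff):

```rust
use std::sync::atomic::{AtomicI32, Ordering::{Relaxed, SeqCst}};
use std::thread;

// Once an SC load has observed a store, later SC loads may not read anything
// earlier in that location's modification order. With both loads on one
// thread, this is exactly read-read coherence.
fn main() {
    static X: AtomicI32 = AtomicI32::new(0);
    let writer = thread::spawn(|| {
        X.store(1, Relaxed);
        X.store(2, Relaxed); // modification order of X: 0, 1, 2
    });
    let a = X.load(SeqCst);
    let b = X.load(SeqCst);
    writer.join().unwrap();
    // Forbidden: a == 2 && b == 1 (b would read "before" what a observed).
    assert!(!(a == 2 && b == 1));
}
```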
diff --git a/tests/fail/should-pass/cpp20_rwc_syncs.rs b/tests/fail/should-pass/cpp20_rwc_syncs.rs
deleted file mode 100644
index cebad507ea..0000000000
--- a/tests/fail/should-pass/cpp20_rwc_syncs.rs
+++ /dev/null
@@ -1,87 +0,0 @@
-//@compile-flags: -Zmiri-ignore-leaks
-
-// https://plv.mpi-sws.org/scfix/paper.pdf
-// 2.2 Second Problem: SC Fences are Too Weak
-// This test should pass under the C++20 model Rust is using.
-// Unfortunately, Miri's weak memory emulation only follows the C++11 model
-// as we don't know how to correctly emulate C++20's revised SC semantics,
-// so we have to stick to C++11 emulation from existing research.
-
-use std::sync::atomic::Ordering::*;
-use std::sync::atomic::{AtomicUsize, fence};
-use std::thread::spawn;
-
-// Spins until it reads the given value
-fn reads_value(loc: &AtomicUsize, val: usize) -> usize {
-    while loc.load(Relaxed) != val {
-        std::hint::spin_loop();
-    }
-    val
-}
-
-// We can't create static items because we need to run each test
-// multiple times.
-fn static_atomic(val: usize) -> &'static AtomicUsize {
-    let ret = Box::leak(Box::new(AtomicUsize::new(val)));
-    // A workaround to put the initialization value in the store buffer.
-    // See https://github.com/rust-lang/miri/issues/2164
-    ret.load(Relaxed);
-    ret
-}
-
-fn test_cpp20_rwc_syncs() {
-    /*
-    int main() {
-        atomic_int x = 0;
-        atomic_int y = 0;
-
-        {{{ x.store(1,mo_relaxed);
-        ||| { r1=x.load(mo_relaxed).readsvalue(1);
-              fence(mo_seq_cst);
-              r2=y.load(mo_relaxed); }
-        ||| { y.store(1,mo_relaxed);
-              fence(mo_seq_cst);
-              r3=x.load(mo_relaxed); }
-        }}}
-        return 0;
-    }
-    */
-    let x = static_atomic(0);
-    let y = static_atomic(0);
-
-    let j1 = spawn(move || {
-        x.store(1, Relaxed);
-    });
-
-    let j2 = spawn(move || {
-        reads_value(&x, 1);
-        fence(SeqCst);
-        y.load(Relaxed)
-    });
-
-    let j3 = spawn(move || {
-        y.store(1, Relaxed);
-        fence(SeqCst);
-        x.load(Relaxed)
-    });
-
-    j1.join().unwrap();
-    let b = j2.join().unwrap();
-    let c = j3.join().unwrap();
-
-    // We cannot write assert_ne!() since ui_test's fail
-    // tests expect exit status 1, whereas panics produce 101.
-    // Our ui_test does not yet support overriding failure status codes.
-    if (b, c) == (0, 0) {
-        // This *should* be unreachable, but Miri will reach it.
-        unsafe {
-            std::hint::unreachable_unchecked(); //~ERROR: unreachable
-        }
-    }
-}
-
-pub fn main() {
-    for _ in 0..500 {
-        test_cpp20_rwc_syncs();
-    }
-}
diff --git a/tests/fail/should-pass/cpp20_rwc_syncs.stderr b/tests/fail/should-pass/cpp20_rwc_syncs.stderr
deleted file mode 100644
index 5185845568..0000000000
--- a/tests/fail/should-pass/cpp20_rwc_syncs.stderr
+++ /dev/null
@@ -1,20 +0,0 @@
-error: Undefined Behavior: entering unreachable code
-  --> tests/fail/should-pass/cpp20_rwc_syncs.rs:LL:CC
-   |
-LL |             std::hint::unreachable_unchecked();
-   |             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ entering unreachable code
-   |
-   = help: this indicates a bug in the program: it performed an invalid operation, and caused Undefined Behavior
-   = help: see https://doc.rust-lang.org/nightly/reference/behavior-considered-undefined.html for further information
-   = note: BACKTRACE:
-   = note: inside `test_cpp20_rwc_syncs` at tests/fail/should-pass/cpp20_rwc_syncs.rs:LL:CC
-note: inside `main`
-  --> tests/fail/should-pass/cpp20_rwc_syncs.rs:LL:CC
-   |
-LL |         test_cpp20_rwc_syncs();
-   |         ^^^^^^^^^^^^^^^^^^^^^^
-
-note: some details are omitted, run with `MIRIFLAGS=-Zmiri-backtrace=full` for a verbose backtrace
-
-error: aborting due to 1 previous error
-
diff --git a/tests/pass/0weak_memory_consistency.rs b/tests/pass/0weak_memory_consistency.rs
index 10f7aed941..840dd56939 100644
--- a/tests/pass/0weak_memory_consistency.rs
+++ b/tests/pass/0weak_memory_consistency.rs
@@ -22,7 +22,7 @@
 // Available: https://ss265.host.cs.st-andrews.ac.uk/papers/n3132.pdf.
 
 use std::sync::atomic::Ordering::*;
-use std::sync::atomic::{AtomicBool, AtomicI32, fence};
+use std::sync::atomic::{AtomicBool, AtomicI32, Ordering, fence};
 use std::thread::spawn;
 
 #[derive(Copy, Clone)]
@@ -45,8 +45,8 @@ fn static_atomic_bool(val: bool) -> &'static AtomicBool {
 }
 
 // Spins until it acquires a pre-determined value.
-fn acquires_value(loc: &AtomicI32, val: i32) -> i32 {
-    while loc.load(Acquire) != val {
+fn loads_value(loc: &AtomicI32, ord: Ordering, val: i32) -> i32 {
+    while loc.load(ord) != val {
         std::hint::spin_loop();
     }
     val
@@ -69,7 +69,7 @@ fn test_corr() {
     });                              // |                      |
     #[rustfmt::skip]                 // |synchronizes-with     |happens-before
     let j3 = spawn(move || {         // |                      |
-        acquires_value(&y, 1);       // <------------------+   |
+        loads_value(&y, Acquire, 1); // <------------+         |
         x.load(Relaxed)              // <----------------------------------------------+
         // The two reads on x are ordered by hb, so they cannot observe values
         // differently from the modification order. If the first read observed
@@ -94,12 +94,12 @@ fn test_wrc() {
     });                              // |                      |
     #[rustfmt::skip]                 // |synchronizes-with     |
    let j2 = spawn(move || {          // |                      |
-        acquires_value(&x, 1);       // <------------------+   |
+        loads_value(&x, Acquire, 1); // <------------+         |
         y.store(1, Release);         // ---------------------+ |happens-before
     });                              // |                      |
     #[rustfmt::skip]                 // |synchronizes-with     |
     let j3 = spawn(move || {         // |                      |
-        acquires_value(&y, 1);       // <------------------+   |
+        loads_value(&y, Acquire, 1); // <------------+         |
         x.load(Relaxed)              // <-----------------------------------------------+
     });
 
@@ -125,7 +125,7 @@ fn test_message_passing() {
     #[rustfmt::skip]                 // |synchronizes-with     | happens-before
     let j2 = spawn(move || {         // |                      |
         let x = x; // avoid field capturing                    |
-        acquires_value(&y, 1);       // <------------------+   |
+        loads_value(&y, Acquire, 1); // <------------+         |
         unsafe { *x.0 }              // <---------------------------------------------+
     });
 
@@ -268,9 +268,6 @@ fn test_iriw_sc_rlx() {
     let x = static_atomic_bool(false);
     let y = static_atomic_bool(false);
 
-    x.store(false, Relaxed);
-    y.store(false, Relaxed);
-
     let a = spawn(move || x.store(true, Relaxed));
     let b = spawn(move || y.store(true, Relaxed));
     let c = spawn(move || {
@@ -290,6 +287,84 @@ fn test_iriw_sc_rlx() {
     assert!(c || d);
 }
 
+// Similar to `test_iriw_sc_rlx` but with fences instead of SC accesses.
+fn test_cpp20_sc_fence_fix() {
+    let x = static_atomic_bool(false);
+    let y = static_atomic_bool(false);
+
+    let thread1 = spawn(move || {
+        let a = x.load(Relaxed);
+        fence(SeqCst);
+        let b = y.load(Relaxed);
+        (a, b)
+    });
+
+    let thread2 = spawn(move || {
+        x.store(true, Relaxed);
+    });
+    let thread3 = spawn(move || {
+        y.store(true, Relaxed);
+    });
+
+    let thread4 = spawn(move || {
+        let c = y.load(Relaxed);
+        fence(SeqCst);
+        let d = x.load(Relaxed);
+        (c, d)
+    });
+
+    let (a, b) = thread1.join().unwrap();
+    thread2.join().unwrap();
+    thread3.join().unwrap();
+    let (c, d) = thread4.join().unwrap();
+    let bad = a == true && b == false && c == true && d == false;
+    assert!(!bad);
+}
+
+// https://plv.mpi-sws.org/scfix/paper.pdf
+// 2.2 Second Problem: SC Fences are Too Weak
+fn test_cpp20_rwc_syncs() {
+    /*
+    int main() {
+        atomic_int x = 0;
+        atomic_int y = 0;
+
+        {{{ x.store(1,mo_relaxed);
+        ||| { r1=x.load(mo_relaxed).readsvalue(1);
+              fence(mo_seq_cst);
+              r2=y.load(mo_relaxed); }
+        ||| { y.store(1,mo_relaxed);
+              fence(mo_seq_cst);
+              r3=x.load(mo_relaxed); }
+        }}}
+        return 0;
+    }
+    */
+    let x = static_atomic(0);
+    let y = static_atomic(0);
+
+    let j1 = spawn(move || {
+        x.store(1, Relaxed);
+    });
+
+    let j2 = spawn(move || {
+        loads_value(&x, Relaxed, 1);
+        fence(SeqCst);
+        y.load(Relaxed)
+    });
+
+    let j3 = spawn(move || {
+        y.store(1, Relaxed);
+        fence(SeqCst);
+        x.load(Relaxed)
+    });
+
+    j1.join().unwrap();
+    let b = j2.join().unwrap();
+    let c = j3.join().unwrap();
+
+    assert!((b, c) != (0, 0));
+}
+
 pub fn main() {
     for _ in 0..50 {
         test_single_thread();
@@ -301,5 +376,7 @@ pub fn main() {
         test_sc_store_buffering();
         test_sync_through_rmw_and_fences();
         test_iriw_sc_rlx();
+        test_cpp20_sc_fence_fix();
+        test_cpp20_rwc_syncs();
     }
 }
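For contrast with `test_sc_store_buffering` and the new tests above: under `Relaxed`, the classic store-buffering outcome where both loads return 0 is perfectly legal, and it is exactly the kind of behavior that is rare in native runs but that Miri's store-buffer emulation can produce (illustrative sketch, not part of the diff):

```rust
use std::sync::atomic::{AtomicI32, Ordering::Relaxed};
use std::thread::spawn;

// Store buffering with Relaxed: both threads may read 0.
fn main() {
    // Leaking gives the &'static lifetime that `spawn` needs.
    let x: &'static AtomicI32 = Box::leak(Box::new(AtomicI32::new(0)));
    let y: &'static AtomicI32 = Box::leak(Box::new(AtomicI32::new(0)));

    let j1 = spawn(move || {
        x.store(1, Relaxed);
        y.load(Relaxed)
    });
    let j2 = spawn(move || {
        y.store(1, Relaxed);
        x.load(Relaxed)
    });

    let a = j1.join().unwrap();
    let b = j2.join().unwrap();
    // (0, 0), (0, 1), (1, 0) and (1, 1) are all allowed here; making both
    // stores and loads SeqCst is what rules out (0, 0).
    println!("a = {a}, b = {b}");
}
```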