diff --git a/Cargo.lock b/Cargo.lock index 151a10a774a..84097cd7086 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -34,7 +34,7 @@ version = "0.7.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "891477e0c6a8957309ee5c45a6368af3ae14bb510732d2684ffa19af310920f9" dependencies = [ - "getrandom 0.2.16", + "getrandom 0.2.13", "once_cell", "version_check", ] @@ -276,6 +276,12 @@ dependencies = [ "syn 2.0.107", ] +[[package]] +name = "async-task" +version = "4.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b75356056920673b02621b35afd0f7dda9306d03c79a30f5c56c44cf256e3de" + [[package]] name = "async-trait" version = "0.1.89" @@ -2551,9 +2557,9 @@ dependencies = [ [[package]] name = "getrandom" -version = "0.2.16" +version = "0.2.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "335ff9f135e4384c8150d6f27c6daed433577f86b4750418338c01a1a2528592" +checksum = "a06fddc2749e0528d2813f95e050e87e52c8cbbae56223b9babf73b3e53b0cc6" dependencies = [ "cfg-if", "js-sys", @@ -6113,7 +6119,7 @@ version = "0.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" dependencies = [ - "getrandom 0.2.16", + "getrandom 0.2.13", ] [[package]] @@ -6187,7 +6193,7 @@ version = "0.4.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ba009ff324d1fc1b900bd1fdb31564febe58a8ccc8a6fdbb93b543d33b13ca43" dependencies = [ - "getrandom 0.2.16", + "getrandom 0.2.13", "libredox", "thiserror 1.0.69", ] @@ -6426,7 +6432,7 @@ checksum = "a4689e6c2294d81e88dc6261c768b63bc4fcdb852be6d1352498b114f61383b7" dependencies = [ "cc", "cfg-if", - "getrandom 0.2.16", + "getrandom 0.2.13", "libc", "untrusted", "windows-sys 0.52.0", @@ -7659,7 +7665,7 @@ checksum = "db18cb19c7499ba4a65b1504442179a7e4aba487dc35978d90966c5ca02ee16b" dependencies = [ "bytemuck", "derive_more 0.99.20", - "getrandom 0.2.16", + 
"getrandom 0.2.13", "log", "rand 0.8.5", "scoped-tls", @@ -7678,7 +7684,7 @@ dependencies = [ "bytemuck", "bytes", "derive_more 0.99.20", - "getrandom 0.2.16", + "getrandom 0.2.13", "http 1.3.1", "insta", "log", @@ -8103,6 +8109,7 @@ dependencies = [ "spacetimedb-physical-plan", "spacetimedb-primitives 2.2.0", "spacetimedb-query", + "spacetimedb-runtime", "spacetimedb-sats 2.2.0", "spacetimedb-schema", "spacetimedb-snapshot", @@ -8187,6 +8194,28 @@ dependencies = [ "thiserror 1.0.69", ] +[[package]] +name = "spacetimedb-dst" +version = "2.2.0" +dependencies = [ + "anyhow", + "clap 4.5.50", + "futures-util", + "spacetimedb-commitlog", + "spacetimedb-core", + "spacetimedb-datastore", + "spacetimedb-durability", + "spacetimedb-lib 2.2.0", + "spacetimedb-primitives 2.2.0", + "spacetimedb-runtime", + "spacetimedb-sats 2.2.0", + "spacetimedb-schema", + "spacetimedb-snapshot", + "spacetimedb-table", + "tracing", + "tracing-subscriber", +] + [[package]] name = "spacetimedb-durability" version = "2.2.0" @@ -8200,6 +8229,7 @@ dependencies = [ "spacetimedb-commitlog", "spacetimedb-fs-utils", "spacetimedb-paths", + "spacetimedb-runtime", "spacetimedb-sats 2.2.0", "tempfile", "thiserror 1.0.69", @@ -8468,6 +8498,17 @@ dependencies = [ "spacetimedb-lib 2.2.0", ] +[[package]] +name = "spacetimedb-runtime" +version = "2.2.0" +dependencies = [ + "async-task", + "futures", + "libc", + "spin", + "tokio", +] + [[package]] name = "spacetimedb-sats" version = "1.9.0" @@ -8648,6 +8689,7 @@ dependencies = [ "spacetimedb-lib 2.2.0", "spacetimedb-paths", "spacetimedb-primitives 2.2.0", + "spacetimedb-runtime", "spacetimedb-sats 2.2.0", "spacetimedb-schema", "spacetimedb-table", diff --git a/Cargo.toml b/Cargo.toml index d1488e186df..094ad0d6b01 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,6 +12,7 @@ members = [ "crates/commitlog", "crates/core", "crates/data-structures", + "crates/dst", "crates/datastore", "crates/durability", "crates/execution", @@ -25,6 +26,7 @@ members = [ 
"crates/physical-plan", "crates/primitives", "crates/query", + "crates/runtime", "crates/sats", "crates/schema", "crates/smoketests", @@ -139,6 +141,7 @@ spacetimedb-pg = { path = "crates/pg", version = "=2.2.0" } spacetimedb-physical-plan = { path = "crates/physical-plan", version = "=2.2.0" } spacetimedb-primitives = { path = "crates/primitives", version = "=2.2.0" } spacetimedb-query = { path = "crates/query", version = "=2.2.0" } +spacetimedb-runtime = { path = "crates/runtime", version = "=2.2.0" } spacetimedb-sats = { path = "crates/sats", version = "=2.2.0" } spacetimedb-schema = { path = "crates/schema", version = "=2.2.0" } spacetimedb-standalone = { path = "crates/standalone", version = "=2.2.0" } diff --git a/crates/commitlog/src/lib.rs b/crates/commitlog/src/lib.rs index abc8729c978..7123b7e3bb6 100644 --- a/crates/commitlog/src/lib.rs +++ b/crates/commitlog/src/lib.rs @@ -156,7 +156,7 @@ impl Options { /// The canonical commitlog API over a repository backend `R`. /// /// The default backend is the on-disk filesystem repository -/// [`repo::Fs`], but tests may supply another [`Repo`] +/// [`repo::Fs`], but tests and simulators may supply another [`Repo`] /// implementation. /// /// Records in the log are of type `T`, which canonically is instantiated to @@ -203,7 +203,7 @@ where { /// Open the log in `repo` with [`Options`]. /// - /// This is useful for tests which provide a repository + /// This is useful for tests and simulators which provide a repository /// implementation other than [`repo::Fs`]. 
pub fn open_with_repo(repo: R, opts: Options) -> io::Result { let inner = commitlog::Generic::open(repo, opts)?; diff --git a/crates/commitlog/src/repo/mod.rs b/crates/commitlog/src/repo/mod.rs index 3d79f7f1e28..76c5d2e365b 100644 --- a/crates/commitlog/src/repo/mod.rs +++ b/crates/commitlog/src/repo/mod.rs @@ -161,6 +161,8 @@ pub trait RepoWithoutLockFile: Repo {} impl RepoWithoutLockFile for &T {} +impl RepoWithoutLockFile for Fs {} + #[cfg(any(test, feature = "test"))] impl RepoWithoutLockFile for Memory {} diff --git a/crates/core/Cargo.toml b/crates/core/Cargo.toml index acdc578080d..6e7075536c2 100644 --- a/crates/core/Cargo.toml +++ b/crates/core/Cargo.toml @@ -28,6 +28,7 @@ spacetimedb-primitives.workspace = true spacetimedb-paths.workspace = true spacetimedb-physical-plan.workspace = true spacetimedb-query.workspace = true +spacetimedb-runtime = { workspace = true, features = ["tokio"] } spacetimedb-sats = { workspace = true, features = ["serde"] } spacetimedb-schema.workspace = true spacetimedb-table.workspace = true diff --git a/crates/core/src/database_logger.rs b/crates/core/src/database_logger.rs index 0e202229dea..f194cb60a48 100644 --- a/crates/core/src/database_logger.rs +++ b/crates/core/src/database_logger.rs @@ -11,7 +11,7 @@ use std::path::Path; use std::pin::Pin; use std::sync::Arc; use std::task::{Context, Poll}; -use tokio::io::{AsyncRead, BufReader}; +use tokio::io::{AsyncRead, BufReader, ReadBuf}; use tokio::sync::{broadcast, mpsc, oneshot}; use tokio_stream::wrappers::errors::BroadcastStreamRecvError; use tokio_stream::wrappers::BroadcastStream; @@ -592,7 +592,7 @@ fn seek_to(file: &mut File, buf: &mut [u8], num_lines: u32) -> io::Result<()> { Ok(()) } -fn read_exact_at(file: &std::fs::File, buf: &mut [u8], offset: u64) -> io::Result<()> { +fn read_exact_at(file: &File, buf: &mut [u8], offset: u64) -> io::Result<()> { #[cfg(unix)] { use std::os::unix::fs::FileExt; @@ -641,7 +641,7 @@ impl MaybeFile { } impl AsyncRead for MaybeFile { - fn 
poll_read(self: Pin<&mut Self>, cx: &mut Context<'_>, buf: &mut tokio::io::ReadBuf<'_>) -> Poll> { + fn poll_read(self: Pin<&mut Self>, cx: &mut Context<'_>, buf: &mut ReadBuf<'_>) -> Poll> { match self.project() { MaybeFileProj::File { inner } => inner.poll_read(cx, buf), MaybeFileProj::Empty => Poll::Ready(Ok(())), diff --git a/crates/core/src/db/durability.rs b/crates/core/src/db/durability.rs index c17a10e9f63..07ec4d356c3 100644 --- a/crates/core/src/db/durability.rs +++ b/crates/core/src/db/durability.rs @@ -9,9 +9,9 @@ use spacetimedb_datastore::{execution_context::ReducerContext, traits::TxData}; use spacetimedb_durability::Transaction; use spacetimedb_lib::Identity; use spacetimedb_sats::ProductValue; -use tokio::{runtime, time::timeout}; use crate::db::persistence::Durability; +use spacetimedb_runtime::Handle; pub(super) fn request_durability( durability: &Durability, @@ -32,11 +32,11 @@ pub(super) fn request_durability( })); } -pub(super) fn spawn_close(durability: Arc, runtime: &runtime::Handle, database_identity: Identity) { - let rt = runtime.clone(); - rt.spawn(async move { - let label = format!("[{database_identity}]"); - match timeout(Duration::from_secs(10), durability.close()).await { +pub(super) fn spawn_close(durability: Arc, runtime: &Handle, database_identity: Identity) { + let label = format!("[{database_identity}]"); + let runtime = runtime.clone(); + runtime.clone().spawn(async move { + match runtime.timeout(Duration::from_secs(10), durability.close()).await { Err(_elapsed) => { error!("{label} timeout waiting for durability shutdown"); } @@ -44,6 +44,7 @@ pub(super) fn spawn_close(durability: Arc, runtime: &runtime::Handle info!("{label} durability shut down at tx offset: {offset:?}"); } } + log::info!("closing spawn close"); }); } diff --git a/crates/core/src/db/persistence.rs b/crates/core/src/db/persistence.rs index 5b0daa5145c..c54a287bec2 100644 --- a/crates/core/src/db/persistence.rs +++ b/crates/core/src/db/persistence.rs @@ -4,7 
+4,8 @@ use async_trait::async_trait; use spacetimedb_commitlog::SizeOnDisk; use spacetimedb_durability::{DurabilityExited, TxOffset}; use spacetimedb_paths::server::ServerDataDir; -use spacetimedb_snapshot::DynSnapshotRepo; +use spacetimedb_runtime::Handle; +use spacetimedb_snapshot::{DynSnapshotRepo, SnapshotStore}; use crate::{messages::control_db::Database, util::asyncify}; @@ -35,14 +36,16 @@ pub struct Persistence { /// Currently the expectation is that the reported size is the commitlog /// size only. pub disk_size: DiskSizeFn, + /// Optional snapshot store used during database restore. + pub snapshot_store: Option>, /// An optional [SnapshotWorker]. /// /// The current expectation is that snapshots are only enabled for /// persistent (as opposed to in-memory) databases. This is enforced by /// this type. pub snapshots: Option, - /// The tokio runtime onto which durability-related tasks shall be spawned. - pub runtime: tokio::runtime::Handle, + /// Runtime onto which durability-related tasks shall be spawned. + pub runtime: Handle, } impl Persistence { @@ -53,9 +56,20 @@ impl Persistence { snapshots: Option, runtime: tokio::runtime::Handle, ) -> Self { + Self::new_with_runtime(durability, disk_size, snapshots, Handle::tokio(runtime)) + } + + pub fn new_with_runtime( + durability: impl spacetimedb_durability::Durability + 'static, + disk_size: impl Fn() -> io::Result + Send + Sync + 'static, + snapshots: Option, + runtime: Handle, + ) -> Self { + let snapshot_store = snapshots.as_ref().map(SnapshotWorker::snapshot_store); Self { durability: Arc::new(durability), disk_size: Arc::new(disk_size), + snapshot_store, snapshots, runtime, } @@ -66,6 +80,13 @@ impl Persistence { self.snapshots.as_ref().map(|worker| worker.snapshot_repo()) } + /// If snapshot restore is enabled, get the [SnapshotStore] to read from. 
+ pub fn snapshot_store(&self) -> Option> { + self.snapshot_store + .clone() + .or_else(|| self.snapshots.as_ref().map(SnapshotWorker::snapshot_store)) + } + /// Get the [TxOffset] reported as durable by the [Durability] impl. /// /// Returns `Ok(None)` if no offset is durable yet, and `Err(DurabilityExited)` @@ -91,12 +112,13 @@ impl Persistence { Option>, Option, Option, - Option, + Option, ) { this.map( |Self { durability, disk_size, + snapshot_store: _, snapshots, runtime, }| (Some(durability), Some(disk_size), snapshots, Some(runtime)), @@ -143,13 +165,15 @@ impl PersistenceProvider for LocalPersistenceProvider { async fn persistence(&self, database: &Database, replica_id: u64) -> anyhow::Result { let replica_dir = self.data_dir.replica(replica_id); let snapshot_dir = replica_dir.snapshots(); + let runtime = Handle::tokio_current(); let database_identity = database.database_identity; let snapshot_worker = asyncify(move || relational_db::open_snapshot_repo(snapshot_dir, database_identity, replica_id)) .await - .map(|repo| SnapshotWorker::new(repo, snapshot::Compression::Enabled))?; - let (durability, disk_size) = relational_db::local_durability(replica_dir, Some(&snapshot_worker)).await?; + .map(|repo| SnapshotWorker::new(repo, snapshot::Compression::Enabled, runtime.clone()))?; + let (durability, disk_size) = + relational_db::local_durability(replica_dir, runtime.clone(), Some(&snapshot_worker)).await?; tokio::spawn(relational_db::snapshot_watching_commitlog_compressor( snapshot_worker.subscribe(), @@ -161,8 +185,9 @@ impl PersistenceProvider for LocalPersistenceProvider { Ok(Persistence { durability, disk_size, + snapshot_store: Some(snapshot_worker.snapshot_store()), snapshots: Some(snapshot_worker), - runtime: tokio::runtime::Handle::current(), + runtime, }) } } diff --git a/crates/core/src/db/relational_db.rs b/crates/core/src/db/relational_db.rs index 9f041c92ccb..75efb0ad5ee 100644 --- a/crates/core/src/db/relational_db.rs +++ 
b/crates/core/src/db/relational_db.rs @@ -42,6 +42,7 @@ use spacetimedb_lib::ConnectionId; use spacetimedb_lib::Identity; use spacetimedb_paths::server::{ReplicaDir, SnapshotsPath}; use spacetimedb_primitives::*; +use spacetimedb_runtime::Handle; use spacetimedb_sats::memory_usage::MemoryUsage; use spacetimedb_sats::raw_identifier::RawIdentifier; use spacetimedb_sats::{AlgebraicType, AlgebraicValue, ProductType, ProductValue}; @@ -51,7 +52,7 @@ use spacetimedb_schema::schema::{ ColumnSchema, IndexSchema, RowLevelSecuritySchema, Schema, SequenceSchema, TableSchema, }; use spacetimedb_schema::table_name::TableName; -use spacetimedb_snapshot::{DynSnapshotRepo, ReconstructedSnapshot, SnapshotError, SnapshotRepository}; +use spacetimedb_snapshot::{DynSnapshotRepo, ReconstructedSnapshot, SnapshotError, SnapshotRepository, SnapshotStore}; use spacetimedb_table::indexes::RowPointer; use spacetimedb_table::page_pool::PagePool; use spacetimedb_table::table::{RowRef, TableScanIter}; @@ -99,7 +100,7 @@ pub struct RelationalDB { inner: Locking, durability: Option>, - durability_runtime: Option, + durability_runtime: Option, snapshot_worker: Option, row_count_fn: RowCountFn, @@ -278,10 +279,10 @@ impl RelationalDB { let start_time = std::time::Instant::now(); - let snapshot_repo = persistence.as_ref().and_then(|p| p.snapshot_repo()); + let snapshot_store = persistence.as_ref().and_then(|p| p.snapshot_store()); let inner = Self::restore_from_snapshot_or_bootstrap( database_identity, - snapshot_repo.as_deref(), + snapshot_store.as_deref(), durable_tx_offset, min_commitlog_offset, page_pool, @@ -472,7 +473,7 @@ impl RelationalDB { fn restore_from_snapshot_or_bootstrap( database_identity: Identity, - snapshot_repo: Option<&DynSnapshotRepo>, + snapshot_store: Option<&dyn SnapshotStore>, durable_tx_offset: Option, min_commitlog_offset: TxOffset, page_pool: PagePool, @@ -480,14 +481,14 @@ impl RelationalDB { // Try to load the `ReconstructedSnapshot` at `snapshot_offset`. 
fn try_load_snapshot( database_identity: &Identity, - snapshot_repo: &DynSnapshotRepo, + snapshot_store: &dyn SnapshotStore, snapshot_offset: TxOffset, page_pool: &PagePool, ) -> Result> { log::info!("[{database_identity}] DATABASE: restoring snapshot of tx_offset {snapshot_offset}"); let start = std::time::Instant::now(); - let snapshot = snapshot_repo + let snapshot = snapshot_store .read_snapshot(snapshot_offset, page_pool) .map_err(Box::new)?; @@ -553,11 +554,11 @@ impl RelationalDB { } } - if let Some((snapshot_repo, durable_tx_offset)) = snapshot_repo.zip(durable_tx_offset) { + if let Some((snapshot_store, durable_tx_offset)) = snapshot_store.zip(durable_tx_offset) { // Mark any newer snapshots as invalid, as the history past // `durable_tx_offset` may have been reset and thus diverge from // any snapshots taken earlier. - snapshot_repo + snapshot_store .invalidate_newer_snapshots(durable_tx_offset) .map_err(|e| RestoreSnapshotError::Invalidate { offset: durable_tx_offset, @@ -568,7 +569,7 @@ impl RelationalDB { // range `(min_commitlog_offset + 1)..=durable_tx_offset`. let mut upper_bound = durable_tx_offset; loop { - let Some(snapshot_offset) = snapshot_repo + let Some(snapshot_offset) = snapshot_store .latest_snapshot_older_than(upper_bound) .map_err(Box::new)? else { @@ -578,7 +579,7 @@ impl RelationalDB { log::debug!("snapshot_offset={snapshot_offset} min_commitlog_offset={min_commitlog_offset}"); break; } - match try_load_snapshot(&database_identity, snapshot_repo, snapshot_offset, &page_pool) { + match try_load_snapshot(&database_identity, snapshot_store, snapshot_offset, &page_pool) { Ok(snapshot) if snapshot.database_identity != database_identity => { return Err(RestoreSnapshotError::IdentityMismatch { expected: database_identity, @@ -594,7 +595,7 @@ impl RelationalDB { // Newly created snapshots should not depend on it. 
if !is_transient_error(&e) { log::info!("invalidating bad snapshot at {snapshot_offset}"); - snapshot_repo.invalidate_snapshot(snapshot_offset).map_err(|e| { + snapshot_store.invalidate_snapshot(snapshot_offset).map_err(|e| { RestoreSnapshotError::Invalidate { offset: snapshot_offset, source: Box::new(e), @@ -1669,9 +1670,9 @@ pub type LocalDurability = Arc>; /// of the commitlog. pub async fn local_durability( replica_dir: ReplicaDir, + runtime: Handle, snapshot_worker: Option<&SnapshotWorker>, ) -> Result<(LocalDurability, DiskSizeFn), DBError> { - let rt = tokio::runtime::Handle::current(); let on_new_segment = snapshot_worker.map(|snapshot_worker| { let snapshot_worker = snapshot_worker.clone(); Arc::new(move || { @@ -1683,7 +1684,7 @@ pub async fn local_durability( let local = asyncify(move || { durability::Local::open( replica_dir.clone(), - rt, + runtime, <_>::default(), // Give the durability a handle to request a new snapshot run, // which it will send down whenever we rotate commitlog segments. 
@@ -1949,19 +1950,23 @@ pub mod tests_utils { ) -> Result<(RelationalDB, Arc>), DBError> { let snapshots = want_snapshot_repo .then(|| { - open_snapshot_repo(root.snapshots(), db_identity, replica_id) - .map(|repo| SnapshotWorker::new(repo, snapshot::Compression::Disabled)) + open_snapshot_repo(root.snapshots(), db_identity, replica_id).map(|repo| { + SnapshotWorker::new(repo, snapshot::Compression::Disabled, Handle::tokio(rt.clone())) + }) }) .transpose()?; - let (local, disk_size_fn) = rt.block_on(local_durability(root.clone(), snapshots.as_ref()))?; + let runtime = Handle::tokio(rt.clone()); + let (local, disk_size_fn) = + rt.block_on(local_durability(root.clone(), runtime.clone(), snapshots.as_ref()))?; let history = local.as_history(); let persistence = Persistence { durability: local.clone(), disk_size: disk_size_fn, + snapshot_store: snapshots.as_ref().map(SnapshotWorker::snapshot_store), snapshots, - runtime: rt, + runtime, }; let (db, _) = RelationalDB::open( @@ -2074,17 +2079,21 @@ pub mod tests_utils { ) -> Result<(RelationalDB, Arc>), DBError> { let snapshots = want_snapshot_repo .then(|| { - open_snapshot_repo(root.snapshots(), Identity::ZERO, 0) - .map(|repo| SnapshotWorker::new(repo, snapshot::Compression::Disabled)) + open_snapshot_repo(root.snapshots(), Identity::ZERO, 0).map(|repo| { + SnapshotWorker::new(repo, snapshot::Compression::Disabled, Handle::tokio(rt.clone())) + }) }) .transpose()?; - let (local, disk_size_fn) = rt.block_on(local_durability(root.clone(), snapshots.as_ref()))?; + let runtime = Handle::tokio(rt.clone()); + let (local, disk_size_fn) = + rt.block_on(local_durability(root.clone(), runtime.clone(), snapshots.as_ref()))?; let history = local.as_history(); let persistence = Persistence { durability: local.clone(), disk_size: disk_size_fn, + snapshot_store: snapshots.as_ref().map(SnapshotWorker::snapshot_store), snapshots, - runtime: rt, + runtime, }; let db = Self::open_db(history, Some(persistence), None, 0)?; diff --git 
a/crates/core/src/db/snapshot.rs b/crates/core/src/db/snapshot.rs index 26e3d8373cf..ac792ee0293 100644 --- a/crates/core/src/db/snapshot.rs +++ b/crates/core/src/db/snapshot.rs @@ -14,10 +14,11 @@ use prometheus::{Histogram, IntGauge}; use spacetimedb_datastore::locking_tx_datastore::{committed_state::CommittedState, datastore::Locking}; use spacetimedb_durability::TxOffset; use spacetimedb_lib::Identity; -use spacetimedb_snapshot::{CompressionStats, DynSnapshotRepo}; +use spacetimedb_snapshot::{BoxedPendingSnapshot, CompressionStats, DynSnapshotRepo, SnapshotRepo, SnapshotStore}; use tokio::sync::watch; -use crate::{util::asyncify, worker_metrics::WORKER_METRICS}; +use crate::worker_metrics::WORKER_METRICS; +use spacetimedb_runtime::Handle; pub type SnapshotDatabaseState = Arc>; @@ -61,6 +62,7 @@ pub struct SnapshotWorker { snapshot_created: watch::Sender, request_snapshot: mpsc::UnboundedSender, snapshot_repository: Arc, + snapshot_store: Arc, } impl SnapshotWorker { @@ -69,29 +71,37 @@ impl SnapshotWorker { /// The handle is only partially initialized, as it is lacking the /// [SnapshotDatabaseState]. This allows control code to [Self::subscribe] /// to future snapshots before handing off the worker to the database. 
- pub fn new(snapshot_repository: Arc, compression: Compression) -> Self { - let database = snapshot_repository.database_identity(); - let latest_snapshot = snapshot_repository.latest_snapshot().ok().flatten().unwrap_or(0); + pub fn new(snapshot_repo: Arc, compression: Compression, rt: Handle) -> Self + where + R: SnapshotRepo + 'static, + { + let snapshot_store: Arc = snapshot_repo.clone(); + let snapshot_repo: Arc = snapshot_repo; + let database = snapshot_repo.database_identity(); + let latest_snapshot = snapshot_repo.latest_snapshot().ok().flatten().unwrap_or(0); let (snapshot_created, _) = watch::channel(latest_snapshot); let (request_tx, request_rx) = mpsc::unbounded(); let actor = SnapshotWorkerActor { snapshot_requests: request_rx, - snapshot_repo: snapshot_repository.clone(), + snapshot_repo: snapshot_repo.clone(), snapshot_created: snapshot_created.clone(), metrics: SnapshotMetrics::new(database), + rt: rt.clone(), compression: compression.is_enabled().then(|| Compressor { - snapshot_repo: snapshot_repository.clone(), + snapshot_repo: snapshot_repo.clone(), metrics: CompressionMetrics::new(database), stats: <_>::default(), + rt: rt.clone(), }), }; - tokio::spawn(actor.run()); + rt.spawn(actor.run()); Self { snapshot_created, request_snapshot: request_tx, - snapshot_repository, + snapshot_repository: snapshot_repo, + snapshot_store, } } @@ -110,6 +120,11 @@ impl SnapshotWorker { self.snapshot_repository.clone() } + /// Get the snapshot store this worker is operating on. + pub fn snapshot_store(&self) -> Arc { + self.snapshot_store.clone() + } + /// Request a snapshot to be taken. /// /// The snapshot will be taken at some point in the future. 
@@ -169,6 +184,7 @@ struct SnapshotWorkerActor { snapshot_repo: Arc, snapshot_created: watch::Sender, metrics: SnapshotMetrics, + rt: Handle, compression: Option, } @@ -220,21 +236,24 @@ impl SnapshotWorkerActor { let inner_timer = self.metrics.snapshot_timing_inner.clone(); let snapshot_repo = self.snapshot_repo.clone(); + let runtime = self.rt.clone(); let database_identity = self.snapshot_repo.database_identity(); - let maybe_snapshot = asyncify(move || { - let _timer = inner_timer.start_timer(); - Locking::take_snapshot_internal(&state, snapshot_repo.as_ref()) - }) - .await - .with_context(|| format!("error capturing snapshot of database {}", database_identity))?; - let (snapshot_offset, unflushed_snapshot) = maybe_snapshot.with_context(|| { - format!( - "refusing to take snapshot of database {} at TX offset -1", - database_identity - ) - })?; + let maybe_snapshot = runtime + .spawn_blocking(move || { + let _timer = inner_timer.start_timer(); + Locking::take_snapshot_internal(&state, snapshot_repo.as_ref()) + }) + .await + .with_context(|| format!("error capturing snapshot of database {}", database_identity))? 
+ .with_context(|| { + format!( + "refusing to take snapshot of database {} at TX offset -1", + database_identity + ) + })?; + let (snapshot_offset, unflushed_snapshot) = maybe_snapshot; self.metrics .snapshot_timing_fsync .observe_closure_duration(|| unflushed_snapshot.sync_all())?; @@ -310,6 +329,7 @@ struct Compressor { snapshot_repo: Arc, metrics: CompressionMetrics, stats: Option, + rt: Handle, } impl Compressor { @@ -341,15 +361,17 @@ impl Compressor { let range = start..latest_snapshot; let mut stats = self.stats.take().unwrap_or_default(); - let (mut stats, res) = asyncify({ - let range = range.clone(); - move || { - let _timer = inner_timer.start_timer(); - let res = snapshot_repo.compress_snapshots(&mut stats, range); - (stats, res) - } - }) - .await; + let rt = self.rt.clone(); + let (mut stats, res) = rt + .spawn_blocking({ + let range = range.clone(); + move || { + let _timer = inner_timer.start_timer(); + let res = snapshot_repo.compress_snapshots(&mut stats, range); + (stats, res) + } + }) + .await; let elapsed = Duration::from_secs_f64(timer.stop_and_record()); self.metrics.report_and_reset(&mut stats); // Store stats for reuse. 
diff --git a/crates/core/src/subscription/module_subscription_actor.rs b/crates/core/src/subscription/module_subscription_actor.rs index d748fdd09ab..742e2eddf83 100644 --- a/crates/core/src/subscription/module_subscription_actor.rs +++ b/crates/core/src/subscription/module_subscription_actor.rs @@ -2102,8 +2102,9 @@ mod tests { Some(Persistence { durability: durability.clone(), disk_size: Arc::new(|| Ok(<_>::default())), + snapshot_store: None, snapshots: None, - runtime: rt, + runtime: spacetimedb_runtime::Handle::tokio(rt), }), None, 0, diff --git a/crates/datastore/src/locking_tx_datastore/datastore.rs b/crates/datastore/src/locking_tx_datastore/datastore.rs index e9d67103b16..254f44c4e01 100644 --- a/crates/datastore/src/locking_tx_datastore/datastore.rs +++ b/crates/datastore/src/locking_tx_datastore/datastore.rs @@ -38,7 +38,7 @@ use spacetimedb_schema::{ reducer_name::ReducerName, schema::{ColumnSchema, IndexSchema, SequenceSchema, TableSchema}, }; -use spacetimedb_snapshot::{BoxedPendingSnapshot, DynSnapshotRepo, ReconstructedSnapshot}; +use spacetimedb_snapshot::{BoxedPendingSnapshot, DynSnapshotRepo, ReconstructedSnapshot, SnapshotStore}; use spacetimedb_table::{ indexes::RowPointer, page_pool::PagePool, @@ -259,6 +259,28 @@ impl Locking { Ok(Some((tx_offset, unflushed_snapshot))) } + pub fn take_snapshot_store_internal( + committed_state: &RwLock, + store: &dyn SnapshotStore, + ) -> Result> { + let mut committed_state = committed_state.write(); + let Some(tx_offset) = committed_state.next_tx_offset.checked_sub(1) else { + return Ok(None); + }; + + log::info!( + "Capturing snapshot of database {:?} at TX offset {}", + store.database_identity(), + tx_offset, + ); + + let (mut tables, blob_store) = committed_state.persistent_tables_and_blob_store(); + store + .capture_snapshot(&mut tables, blob_store, tx_offset) + .map(Some) + .map_err(Into::into) + } + /// Returns a list over all the currently connected clients, /// reading from the `st_clients` system 
table. pub fn connected_clients<'a>( @@ -2824,6 +2846,38 @@ pub(crate) mod tests { Ok(()) } + #[test] + fn test_try_begin_mut_tx_reports_writer_contention() -> ResultTest<()> { + let datastore = get_datastore()?; + let tx = begin_mut_tx(&datastore); + assert!(datastore + .try_begin_mut_tx(IsolationLevel::Serializable, Workload::ForTests) + .is_none()); + let _ = datastore.rollback_mut_tx(tx); + + let tx = datastore + .try_begin_mut_tx(IsolationLevel::Serializable, Workload::ForTests) + .expect("write lock should be available after rollback"); + let _ = datastore.rollback_mut_tx(tx); + Ok(()) + } + + #[test] + fn test_try_begin_mut_tx_reports_read_contention() -> ResultTest<()> { + let datastore = get_datastore()?; + let tx = begin_tx(&datastore); + assert!(datastore + .try_begin_mut_tx(IsolationLevel::Serializable, Workload::ForTests) + .is_none()); + let _ = datastore.release_tx(tx); + + let tx = datastore + .try_begin_mut_tx(IsolationLevel::Serializable, Workload::ForTests) + .expect("write lock should be available after read release"); + let _ = datastore.rollback_mut_tx(tx); + Ok(()) + } + #[test] fn test_scheduled_table_insert_and_update() -> ResultTest<()> { // Build the minimal schema that is a valid scheduler table. 
diff --git a/crates/dst/Cargo.toml b/crates/dst/Cargo.toml new file mode 100644 index 00000000000..c3e2b3ea519 --- /dev/null +++ b/crates/dst/Cargo.toml @@ -0,0 +1,33 @@ +[package] +name = "spacetimedb-dst" +version.workspace = true +edition.workspace = true +license-file = "LICENSE" +description = "Deterministic simulation testing utilities for SpacetimeDB crates" +rust-version.workspace = true + +[lints] +workspace = true + +[[bin]] +name = "spacetimedb-dst" +path = "src/main.rs" +bench = false + +[dependencies] +anyhow.workspace = true +clap.workspace = true +futures-util.workspace = true +spacetimedb-datastore = { workspace = true, features = ["test"] } +spacetimedb_core = { package = "spacetimedb-core", path = "../core", version = "=2.2.0", features = ["test"] } +spacetimedb-commitlog = { workspace = true, features = ["test"] } +spacetimedb_durability = { package = "spacetimedb-durability", path = "../durability", version = "=2.2.0", features = ["test"] } +spacetimedb-lib.workspace = true +spacetimedb-snapshot.workspace = true +spacetimedb-primitives.workspace = true +spacetimedb-runtime = { workspace = true, features = ["simulation"] } +spacetimedb-sats.workspace = true +spacetimedb-schema = { workspace = true, features = ["test"] } +spacetimedb-table.workspace = true +tracing.workspace = true +tracing-subscriber.workspace = true diff --git a/crates/dst/README.md b/crates/dst/README.md new file mode 100644 index 00000000000..e9c756a5646 --- /dev/null +++ b/crates/dst/README.md @@ -0,0 +1,227 @@ +# `spacetimedb-dst` + +Deterministic simulation testing for SpacetimeDB components. + +DST is not a generic random fuzzer. It is a seed-replayable framework for +generating meaningful SpacetimeDB histories, executing them against real +implementation paths, and checking semantic properties while the run is still +in progress. + +## First Principles + +- A failing run must be reproducible from target, scenario, seed, run budget, + and fault profile. 
Use `--max-interactions` for exact replay; `--duration` is + a wall-clock soak budget and may stop at a different step count on another + machine or runtime. +- Workloads describe legal but stressful user behavior. They should not depend + on target internals. +- Targets execute interactions against real SpacetimeDB code. +- Properties check externally observable behavior, preferably against a simple + model or a replayed durable history. +- Generation, execution, and property checking stay separate so failures are + diagnosable as workload bugs, target bugs, or weak assertions. +- Runs stream interactions instead of materializing a full plan by default. +- Fault injection is explicit, configurable, and summarized in the outcome. +- Shared probability and weighting logic belongs in `workload::strategy`, not + ad hoc scenario code. + +## Current Architecture + +The CLI selects a target, scenario, seed, budget, and fault profile. The shared +runner pulls one interaction at a time from a source, sends it to the target, +and asks the property runtime to observe the result. + +```text +CLI -> TargetDescriptor -> WorkloadSource -> TargetEngine -> Observation + \-> StreamingProperties -> Outcome +``` + +The core contracts are: + +- `WorkloadSource`: deterministic pull-based interaction stream. +- `TargetEngine`: target-specific execution and outcome collection. +- `StreamingProperties`: reusable property checks over observations and target + accessors. + +## Client Model + +DST workloads use shared logical client IDs rather than target-owned ad hoc +connection numbers. A `ClientId` is a stable actor in the generated history; a +`SessionId` is one live connection/session for that actor. A single client can +own multiple active sessions, which matters for reconnect, multi-tab, and future +replication traffic. Targets translate those IDs into their own handles: + +- `relational-db-commitlog` maps `SessionId` to direct write/read transaction + slots. 
+- future replication targets can map `SessionId` plus endpoint/node IDs to a + client connection routed through the simulated network. + +Concrete handles stay target-owned. Shared workloads should carry logical +identity and lifecycle intent, not `RelTx`, websocket handles, or target-specific +connection objects. + +## Workload Composition + +DST workloads use three building blocks: + +- **Source:** emits a deterministic stream of interactions. +- **Profile:** configures weights, schema shape, and generation policy. +- **Layer:** wraps a source and adds lifecycle, fault, or cross-cutting + interactions. + +`table_ops` is the base table-transaction workload. `commitlog_ops` composes it +and injects durability lifecycle operations such as sync, close/reopen, dynamic +table create/migrate/drop, and replay checks. + +Use this rule of thumb: + +- Add a new profile when the interaction language is unchanged and only weights + or schema shape differ. +- Add a new layer when you are adding lifecycle behavior around an existing + source. +- Add a new workload family only when the interaction vocabulary is genuinely + different. + +## Table Operation Semantics + +The table workload keeps the executable operation language small. Similar +cases converge into physical operations such as `InsertRows`, `DeleteRows`, and +`BeginTx`; the generated interaction also carries a case label for coverage and +debug output. + +Correctness does not come from that label. 
The property runtime asks its model +what the physical operation should do: + +- inserting fresh rows should mutate the table +- inserting an exact visible row should be an idempotent no-op +- inserting an existing primary id with a different payload should report a + unique-key error +- deleting visible rows should mutate the table +- deleting absent rows should report a missing-row error +- beginning or writing behind another writer should report a write conflict +- query operations (`PointLookup`, `PredicateCount`, `RangeScan`, `FullScan`) + should match the model-visible state + +The case label still matters for summaries. It lets a run report that it hit +`ExactDuplicateInsert` or `UniqueKeyConflictInsert`, without teaching the target +or properties to trust generator-provided expectations. + +## Current Targets + +- `relational-db-commitlog`: runs table and commitlog lifecycle interactions + against `RelationalDB`, local durability, dynamic schema operations, + close/reopen, and replay-from-history checks. + +## Properties + +Properties live in `src/properties.rs` and are selected by target. +Table-oriented properties use `TargetPropertyAccess` so the property runtime can +ask a target for rows, counts, lookups, and range scans without knowing target +storage internals. + +Current property families include: + +- insert/select and delete/select checks +- observed error vs model-predicted error matching +- model-predicted no-op checks +- point lookup, predicate count, range scan, and full scan vs the table oracle +- NoREC-style optimizer-vs-direct checks +- TLP-style true/false/null partition checks +- index range exclusion checks +- banking mirror-table invariants +- dynamic migration auto-increment checks +- durable replay state vs the oracle committed model + +## Fault Injection + +`relational-db-commitlog` can wrap the in-memory commitlog repo in +`BuggifiedRepo`. Fault decisions are deterministic from the run seed and +summarized in the final outcome. 
+ +Profiles: + +- `off`: no injected disk behavior. +- `light`: latency and occasional short I/O. +- `default`: stronger latency and short I/O pressure. +- `aggressive`: higher latency and short I/O rates. I/O error hooks exist but + are currently disabled in profile-driven runs because local durability does + not yet classify those errors as recoverable target outcomes. + +## Running + +Fast local run: + +```bash +cargo run -p spacetimedb-dst -- run --target relational-db-commitlog --seed 42 --max-interactions 200 +``` + +Scenario examples: + +```bash +cargo run -p spacetimedb-dst -- run --target relational-db-commitlog --scenario banking --duration 5m +cargo run -p spacetimedb-dst -- run --target relational-db-commitlog --scenario indexed-ranges --duration 5m +``` + +Run with commitlog faults: + +```bash +cargo run -p spacetimedb-dst -- run \ + --target relational-db-commitlog \ + --seed 42 \ + --max-interactions 400 \ + --commitlog-fault-profile default +``` + +Trace every interaction: + +```bash +RUST_LOG=trace cargo run -p spacetimedb-dst -- run --target relational-db-commitlog --duration 5m +``` + +## Run Budgets + +Prefer `--max-interactions` when reporting or replaying a failure. It is the +deterministic interaction budget, so target, scenario, seed, interaction count, +and fault profile are enough to rerun the same generated stream. + +Use `--duration` for local soaks. It is intentionally wall-clock based, so it +can stop after a different number of interactions if host speed, logging, or +runtime behavior changes. + +## Reading The Code + +Start here: + +- `src/core/mod.rs`: source, engine, property, and runner traits. +- `src/workload/table_ops`: table interaction language, generation model, and + scenarios. +- `src/workload/commitlog_ops`: lifecycle layer over table workloads. +- `src/sim/`: local executor and deterministic-decision shim. +- `src/properties.rs`: property catalog and oracle/model checks. 
+- `src/targets/relational_db_commitlog.rs`: target adapter for RelationalDB, + commitlog durability, fault injection, close/reopen, and replay. +- `src/targets/buggified_repo.rs`: deterministic disk-like fault layer. + +## Adding A New Target + +1. Add a target engine in `src/targets/<target_name>.rs`. +2. Reuse an existing workload family or add `src/workload/<workload_name>/`. +3. Return observations that are rich enough for properties to validate behavior. +4. Plug target-specific properties through `PropertyRuntime`. +5. Add a `TargetDescriptor` in `src/targets/descriptor.rs`. +6. Register the target in CLI `TargetKind`. + +## Current Gaps + +- No structured trace/replay format yet. +- No shrinker yet; seed replay is the current reproduction mechanism. +- Sometimes-property reporting is still outcome-counter based, not a stable + property-event catalog. +- The local `sim` shim is not a real simulator yet. It owns executor setup and + deterministic fault decisions so future simulator work has one boundary. +- The current `RelationalDB` target drives open read snapshots to release before + starting writes, because beginning a write behind an open read snapshot can + block in this target shape. Interleaved read/write snapshot histories should + come back once the target models that lock behavior explicitly. +- Runtime-boundary work for scheduler, time, network, filesystem, and lower + randomness sources is still future work. diff --git a/crates/dst/src/client.rs b/crates/dst/src/client.rs new file mode 100644 index 00000000000..84b215a7198 --- /dev/null +++ b/crates/dst/src/client.rs @@ -0,0 +1,70 @@ +//! Logical client and topology identifiers shared by DST workloads and targets. +//! +//! These IDs are part of the generated workload language. Targets translate +//! them into concrete handles such as direct database transaction slots, +//! `ClientConnection`s, websocket sessions, or simulated-node connections. + +use std::fmt; + +/// Stable logical client identity within one DST run.
+/// +/// A `ClientId` is an actor/user identity, not a live network connection. One +/// client may own zero, one, or many [`SessionId`]s at the same time. +#[derive(Clone, Copy, Debug, Default, Eq, PartialEq, Ord, PartialOrd, Hash)] +pub struct ClientId(u32); + +impl ClientId { + pub const ZERO: Self = Self(0); + + pub const fn new(raw: u32) -> Self { + Self(raw) + } +} + +impl fmt::Display for ClientId { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "client{}", self.0) + } +} + +/// Logical live connection/session for a client. +/// +/// Current single-process targets use `SessionId` anywhere old DST code said +/// "connection": transaction slots, read snapshots, reducer-call handles, and +/// property observations. A target translates this logical session into its +/// concrete handle, such as a `RelTx` slot or `ClientConnection`. +/// +/// The `generation` field is the per-client session ordinal. Workloads can keep +/// several generations active concurrently to model one client with multiple +/// open connections, or allocate a later generation after a reconnect. +#[derive(Clone, Copy, Debug, Default, Eq, PartialEq, Ord, PartialOrd, Hash)] +pub struct SessionId { + pub client: ClientId, + pub generation: u32, +} + +impl SessionId { + pub const ZERO: Self = Self::new(ClientId::ZERO, 0); + + pub const fn new(client: ClientId, generation: u32) -> Self { + Self { client, generation } + } + + /// Compatibility helper for today's fixed-size session pools. + /// + /// A run with `N` connections starts as one logical client with `N` + /// sessions: `client0/session0`, `client0/session1`, ... 
+ pub(crate) const fn from_index(index: usize) -> Self { + Self::new(ClientId::ZERO, index as u32) + } + + pub(crate) const fn as_index(self) -> usize { + self.generation as usize + } +} + +impl fmt::Display for SessionId { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}.session{}", self.client, self.generation) + } +} diff --git a/crates/dst/src/config.rs b/crates/dst/src/config.rs new file mode 100644 index 00000000000..1f37e217fb8 --- /dev/null +++ b/crates/dst/src/config.rs @@ -0,0 +1,118 @@ +//! Shared run-budget configuration for DST targets. + +use std::time::{Duration, Instant}; + +/// Storage fault-injection profile for commitlog and snapshot wrappers. +/// +/// These are not CLI options yet; they are programmatic knobs for targeted +/// fault-injection tests. +#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)] +pub(crate) enum CommitlogFaultProfile { + /// No faults injected regardless of buggify state. + Off, + /// Low probability latency and short I/O only. + Light, + /// Moderate-latency and short I/O only. + #[default] + Default, + /// Heavy-latency and short I/O only. + Aggressive, +} + +/// Common stop conditions for generated DST runs. +#[derive(Clone, Debug, Default, Eq, PartialEq)] +pub struct RunConfig { + /// Hard cap on generated interactions. `None` means no interaction budget. + /// + /// This is the preferred budget for exact seed replay: the same target, + /// scenario, seed, max-interactions value, and fault profile should produce + /// the same generated interaction stream. + pub max_interactions: Option<usize>, + /// Wall-clock duration budget in milliseconds. `None` means no time budget. + /// + /// Duration runs are useful as local soaks, but the exact stop step can vary + /// with host speed and runtime behavior. Use `max_interactions` when a + /// failure needs precise replay.
+ pub max_duration_ms: Option<u64>, +} + +impl RunConfig { + /// Build a config bounded only by a deterministic interaction count. + pub fn with_max_interactions(max_interactions: usize) -> Self { + Self { + max_interactions: Some(max_interactions), + max_duration_ms: None, + } + } + + /// Build a config bounded only by a wall-clock duration such as `500ms` or `5m`. + /// + /// # Errors + /// + /// Fails when `duration` is not a valid spec or does not fit in `u64` milliseconds. + pub fn with_duration_spec(duration: &str) -> anyhow::Result<Self> { + // A truncating `as u64` cast would silently wrap oversized budgets. + let millis = u64::try_from(parse_duration_spec(duration)?.as_millis()) + .map_err(|_| anyhow::anyhow!("duration spec too large: {duration}"))?; + Ok(Self { + max_interactions: None, + max_duration_ms: Some(millis), + }) + } + + /// Return the wall-clock deadline for duration-budgeted runs. + /// + /// This intentionally uses `std::time::Instant`, not simulated time. DST + /// duration budgets are a harness stop condition rather than part of the + /// simulated system under test. + /// + /// Returns `None` when no duration budget is set or when the deadline is + /// not representable as an `Instant`. + pub fn deadline(&self) -> Option<Instant> { + // `Instant + Duration` panics on overflow; `checked_add` does not. + let budget = Duration::from_millis(self.max_duration_ms?); + Instant::now().checked_add(budget) + } + + /// Return the interaction budget, or `default` when none is configured. + pub fn max_interactions_or_default(&self, default: usize) -> usize { + self.max_interactions.unwrap_or(default) + } +} + +/// Parse a duration spec of the form `<integer><unit>` with unit `ms`, `s`, `m`, or `h`. +/// +/// # Errors +/// +/// Fails on an empty spec, a missing number or unit, an unsupported unit, or +/// a value too large to represent. +pub fn parse_duration_spec(spec: &str) -> anyhow::Result<Duration> { + let spec = spec.trim(); + if spec.is_empty() { + anyhow::bail!("duration spec cannot be empty"); + } + + let split_at = spec + .find(|ch: char| !ch.is_ascii_digit()) + .ok_or_else(|| anyhow::anyhow!("duration spec missing unit: {spec}"))?; + let (digits, unit) = spec.split_at(split_at); + if digits.is_empty() { + anyhow::bail!("duration spec missing value: {spec}"); + } + let value: u64 = digits.parse()?; + // Reject overflow instead of silently clamping the requested budget. + let scaled_secs = |factor: u64| { + value + .checked_mul(factor) + .map(Duration::from_secs) + .ok_or_else(|| anyhow::anyhow!("duration spec too large: {spec}")) + }; + + match unit { + "ms" => Ok(Duration::from_millis(value)), + "s" => Ok(Duration::from_secs(value)), + "m" => scaled_secs(60), + "h" => scaled_secs(60 * 60), + _ => anyhow::bail!("unsupported duration unit: {unit}"), + } +} diff --git a/crates/dst/src/core/mod.rs b/crates/dst/src/core/mod.rs new file mode 100644 index 00000000000..400c132a35f --- /dev/null +++ b/crates/dst/src/core/mod.rs @@ -0,0 +1,264 @@ +//! Core abstractions for pluggable DST workloads, engines, and properties.
+ +use std::{ + any::Any, + fmt::Debug, + future::Future, + panic::{self, AssertUnwindSafe}, +}; + +use crate::config::RunConfig; +use futures_util::FutureExt; + +/// Pull-based deterministic interaction source. +pub trait WorkloadSource { + type Interaction; + + fn next_interaction(&mut self) -> Option; + fn request_finish(&mut self); +} + +/// Target execution contract over a workload interaction stream. +pub trait TargetEngine { + type Observation; + type Outcome; + type Error; + + fn execute_interaction<'a>( + &'a mut self, + interaction: &'a I, + ) -> impl Future> + 'a; + fn finish(&mut self); + fn collect_outcome<'a>(&'a mut self) -> impl Future> + 'a; +} + +/// Property runtime contract for the shared streaming runner. +pub trait StreamingProperties +where + E: TargetEngine, +{ + fn observe(&mut self, engine: &E, interaction: &I, observation: &O) -> Result<(), String>; + fn finish(&mut self, engine: &E, outcome: &E::Outcome) -> Result<(), String>; +} + +/// Shared streaming runner with property orchestration. 
+pub async fn run_streaming( + mut source: S, + mut engine: E, + mut properties: P, + cfg: RunConfig, +) -> anyhow::Result +where + I: Clone + Debug, + S: WorkloadSource, + E: TargetEngine, + P: StreamingProperties, +{ + let deadline = cfg.deadline(); + let mut step = 0usize; + loop { + if deadline.is_some_and(|d| std::time::Instant::now() >= d) { + source.request_finish(); + } + let Some(interaction) = source.next_interaction() else { + break; + }; + let execution = guard_target("execute_interaction", step, Some(&interaction), || { + engine.execute_interaction(&interaction) + }) + .await + .map_err(|e| anyhow::anyhow!("property violation at step {step}: {e}"))?; + let observation = execution.map_err(|e| anyhow::anyhow!("interaction execution failed at step {step}: {e}"))?; + properties + .observe(&engine, &interaction, &observation) + .map_err(|e| anyhow::anyhow!("property violation at step {step}: {e}"))?; + step = step.saturating_add(1); + } + guard_target("finish", step, Option::<&I>::None, || async { + engine.finish(); + }) + .await + .map_err(|e| anyhow::anyhow!("property violation at finish: {e}"))?; + let outcome = guard_target("collect_outcome", step, Option::<&I>::None, || engine.collect_outcome()) + .await + .map_err(|e| anyhow::anyhow!("property violation while collecting outcome: {e}"))??; + properties + .finish(&engine, &outcome) + .map_err(|e| anyhow::anyhow!("property violation at finish: {e}"))?; + Ok(outcome) +} + +async fn guard_target( + phase: &'static str, + step: usize, + interaction: Option<&I>, + make_future: impl FnOnce() -> Fut, +) -> Result +where + I: Debug, + Fut: Future, +{ + let future = panic::catch_unwind(AssertUnwindSafe(make_future)) + .map_err(|payload| not_crash_error(phase, step, interaction, &payload))?; + AssertUnwindSafe(future) + .catch_unwind() + .await + .map_err(|payload| not_crash_error(phase, step, interaction, &payload)) +} + +fn not_crash_error( + phase: &'static str, + step: usize, + interaction: Option<&I>, + 
payload: &Box, +) -> String { + let payload = panic_payload_to_string(payload); + match interaction { + Some(interaction) => { + format!("[NotCrash] target panicked during {phase} at step {step}: interaction={interaction:?}, payload={payload}") + } + None => format!("[NotCrash] target panicked during {phase} after step {step}: payload={payload}"), + } +} + +fn panic_payload_to_string(payload: &Box<dyn Any + Send>) -> String { + if let Some(message) = payload.downcast_ref::<&'static str>() { + (*message).to_string() + } else if let Some(message) = payload.downcast_ref::<String>() { + message.clone() + } else { + "<non-string panic payload>".to_string() // neither `&str` nor `String`: keep the report non-empty + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[derive(Clone, Debug)] + struct TestInteraction; + + struct SingleStepSource { + emitted: bool, + } + + impl SingleStepSource { + fn new() -> Self { + Self { emitted: false } + } + } + + impl WorkloadSource for SingleStepSource { + type Interaction = TestInteraction; + + fn next_interaction(&mut self) -> Option { + if self.emitted { + None + } else { + self.emitted = true; + Some(TestInteraction) + } + } + + fn request_finish(&mut self) {} + } + + #[derive(Clone, Copy, Debug, Eq, PartialEq)] + enum PanicPhase { + Execute, + Finish, + CollectOutcome, + } + + struct PanicEngine { + phase: PanicPhase, + } + + impl PanicEngine { + fn new(phase: PanicPhase) -> Self { + Self { phase } + } + } + + impl TargetEngine for PanicEngine { + type Observation = (); + type Outcome = (); + type Error = String; + + fn execute_interaction<'a>( + &'a mut self, + _interaction: &'a TestInteraction, + ) -> impl Future> + 'a { + async move { + if self.phase == PanicPhase::Execute { + panic!("execute panic"); + } + Ok(()) + } + } + + fn finish(&mut self) { + if self.phase == PanicPhase::Finish { + panic!("finish panic"); + } + } + + fn collect_outcome<'a>(&'a mut self) -> impl Future> + 'a { + async move { + if self.phase == PanicPhase::CollectOutcome { + panic!("collect panic"); + } + Ok(()) + } + } + } + + struct NoopProperties; + +
impl StreamingProperties for NoopProperties { + fn observe( + &mut self, + _engine: &PanicEngine, + _interaction: &TestInteraction, + _observation: &(), + ) -> Result<(), String> { + Ok(()) + } + + fn finish(&mut self, _engine: &PanicEngine, _outcome: &()) -> Result<(), String> { + Ok(()) + } + } + + #[test] + fn not_crash_catches_execute_panic() { + assert_not_crash_error(PanicPhase::Execute, "execute_interaction", "execute panic"); + } + + #[test] + fn not_crash_catches_finish_panic() { + assert_not_crash_error(PanicPhase::Finish, "finish", "finish panic"); + } + + #[test] + fn not_crash_catches_collect_outcome_panic() { + assert_not_crash_error(PanicPhase::CollectOutcome, "collect_outcome", "collect panic"); + } + + fn assert_not_crash_error(phase: PanicPhase, expected_phase: &str, expected_payload: &str) { + let mut runtime = crate::sim::Runtime::new(0).expect("runtime"); + let err = runtime + .block_on(run_streaming( + SingleStepSource::new(), + PanicEngine::new(phase), + NoopProperties, + RunConfig::with_max_interactions(1), + )) + .unwrap_err() + .to_string(); + + assert!(err.contains("[NotCrash]")); + assert!(err.contains(expected_phase)); + assert!(err.contains(expected_payload)); + } +} diff --git a/crates/dst/src/lib.rs b/crates/dst/src/lib.rs new file mode 100644 index 00000000000..cfebd1a113d --- /dev/null +++ b/crates/dst/src/lib.rs @@ -0,0 +1,47 @@ +//! Deterministic simulation testing utilities for SpacetimeDB crates. +//! +//! Public surface is intentionally narrow and centered on the CLI: +//! +//! - [`client`] for logical client/session identifiers, +//! - [`config`] for run budgets, +//! - [`properties`] for reusable semantic checks, +//! - [`workload`] for scenario identifiers, +//! - [`targets`] for the executable relational-db adapter. +//! +//! ## DST principles +//! +//! 1. Every generated choice comes from a simulator-provided deterministic +//! source. A failing run should be replayable from the printed seed and CLI +//! arguments. 
Use `--max-interactions` for exact replay; duration budgets are +//! wall-clock soak limits. +//! 2. Workloads describe legal but stressful user behavior. Targets may add +//! faults and lifecycle disruption, but the generator should not depend on +//! target internals. +//! 3. Oracles should check observable state, not merely absence of panics. When +//! possible, compare the target against a simple model. +//! 4. Keep generation, execution, and property checking separate. This makes it +//! clear whether a failure came from an invalid workload, a target bug, or a +//! weak assertion. +//! 5. Prefer streaming state machines over precomputed traces. DST runs should +//! scale by budget and duration without materializing the whole workload. +//! 6. Fault injection must be explicit, configurable, and summarized in the run +//! output. Profiles should start with recoverable API-level behavior before +//! introducing crash or corruption semantics. +//! 7. Shared randomness, weighting, and sampling helpers belong in the +//! workload strategy module, not in ad hoc target or scenario code. + +/// Logical client/session identifiers shared by workloads and targets. +pub mod client; +/// Shared run-budget configuration for DST targets. +pub mod config; +/// Core traits/runners for pluggable workloads and targets. +pub mod core; +/// Reusable semantic properties and oracle-model checks. +pub(crate) mod properties; +mod schema; +/// Local executor and deterministic-decision shim. +pub mod sim; +/// Concrete simulator targets. +pub mod targets; +/// Shared workload generators reused by multiple targets. 
+pub mod workload; diff --git a/crates/dst/src/main.rs b/crates/dst/src/main.rs new file mode 100644 index 00000000000..b957c4fb0c4 --- /dev/null +++ b/crates/dst/src/main.rs @@ -0,0 +1,150 @@ +use std::time::{SystemTime, UNIX_EPOCH}; + +use clap::{Args, Parser, Subcommand}; +use spacetimedb_dst::{ + config::RunConfig, + targets::descriptor::{RelationalDbCommitlogDescriptor, TargetDescriptor}, + workload::table_ops::TableScenarioId, +}; + +#[derive(Parser, Debug)] +#[command(name = "spacetimedb-dst")] +#[command(about = "Run deterministic simulation targets")] +struct Cli { + #[command(subcommand)] + command: Command, +} + +#[derive(Subcommand, Debug)] +enum Command { + Run(RunArgs), +} + +#[derive(Args, Debug)] +struct RunArgs { + #[arg(long, help = "Seed for generated choices. Defaults to wall-clock time.")] + seed: Option, + #[arg( + long, + help = "Wall-clock soak budget such as 500ms, 10s, 5m, or 1h. Use --max-interactions for exact replay." + )] + duration: Option, + #[arg(long, help = "Deterministic interaction budget.
Preferred for replayable failures.")] + max_interactions: Option, + #[arg(long, help = "Scenario to run [default: random-crud]")] + scenario: Option, +} + +fn main() -> anyhow::Result<()> { + init_tracing(); + match Cli::parse().command { + Command::Run(args) => run_command(args), + } +} + +fn init_tracing() { + use tracing_subscriber::{fmt, EnvFilter}; + + let filter = EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new("info")); + let _ = fmt() + .with_env_filter(filter) + .with_target(false) + .with_thread_ids(false) + .with_thread_names(false) + .compact() + .try_init(); +} + +/// Resolve CLI arguments into a seed, budget, and scenario, then run the target. +/// +/// # Errors +/// +/// Fails on an invalid duration or scenario, or when the run itself fails. Run +/// failures carry the seed so that a wall-clock-seeded run stays replayable. +fn run_command(args: RunArgs) -> anyhow::Result<()> { + let seed = resolve_seed(args.seed); + let config = build_config(args.duration.as_deref(), args.max_interactions)?; + let scenario = resolve_scenario(args.scenario.as_deref())?; + + // The seed defaults to wall-clock time, so log it up front and attach it to any failure. + tracing::info!(seed, "starting DST run"); + run_prepared_target::<RelationalDbCommitlogDescriptor>(seed, scenario, config) + .map_err(|err| err.context(format!("DST run failed (replay with --seed {seed})"))) +} + +fn run_prepared_target( + seed: u64, + scenario: D::Scenario, + config: RunConfig, +) -> anyhow::Result<()> +where + D: 'static, + D::Scenario: Send + 'static, +{ + D::prepare(seed, &scenario, &config)?; + std::thread::spawn(move || { + let mut runtime = spacetimedb_dst::sim::Runtime::new(seed)?; + runtime.block_on(run_target::(seed, scenario, config)) + }) + .join() + .unwrap_or_else(|payload| std::panic::resume_unwind(payload)) +} + +fn resolve_seed(seed: Option) -> u64 { + seed.unwrap_or_else(|| { + SystemTime::now() + .duration_since(UNIX_EPOCH) + .expect("time went backwards") + .as_nanos() as u64 + }) +} + +/// Map the `--scenario` argument to a scenario id, defaulting to `random-crud`. +/// +/// # Errors +/// +/// Fails on an unrecognized name: silently substituting another scenario would +/// make the run differ from the one the user asked for. +fn resolve_scenario(scenario: Option<&str>) -> anyhow::Result<TableScenarioId> { + match scenario { + Some("random-crud") | None => Ok(TableScenarioId::RandomCrud), + Some(other) => anyhow::bail!("unknown scenario: {other} (supported: random-crud)"), + } +} + +/// Combine the optional duration and interaction budgets into a [`RunConfig`]. +/// +/// With neither budget given, the run defaults to 1000 interactions. +/// +/// # Errors +/// +/// Fails when `duration` is not a valid duration spec or exceeds `u64` milliseconds. +fn build_config(duration: Option<&str>, max_interactions: Option<usize>) -> anyhow::Result<RunConfig> { + let max_duration_ms = duration
+ .map(|spec| -> anyhow::Result<u64> { + let parsed = spacetimedb_dst::config::parse_duration_spec(spec)?; + // A truncating `as u64` cast would silently wrap oversized budgets. + u64::try_from(parsed.as_millis()).map_err(|_| anyhow::anyhow!("duration spec too large: {spec}")) + }) + .transpose()?; + // Only fall back to the default interaction budget when no budget was given at all. + let max_interactions = match (max_duration_ms, max_interactions) { + (None, None) => Some(1_000), + (_, requested) => requested, + }; + Ok(RunConfig { + max_interactions, + max_duration_ms, + }) +} + +#[allow(clippy::disallowed_macros)] +async fn run_target( + seed: u64, + scenario: D::Scenario, + config: RunConfig, +) -> anyhow::Result<()> { + let line = D::run_streaming(seed, scenario, config).await?; + println!("{line}"); + Ok(()) +} diff --git a/crates/dst/src/properties.rs b/crates/dst/src/properties.rs new file mode 100644 index 00000000000..dbe227c2dd9 --- /dev/null +++ b/crates/dst/src/properties.rs @@ -0,0 +1,188 @@ +//! Reusable property runtime shared by DST targets. +//! +//! This module is the boundary between target execution and semantic checking. +//! Targets emit observations and implement [`TargetPropertyAccess`]; property +//! rules compare those observations against either the target's externally +//! visible state, an oracle model, or durable replay state. +//! +//! ## Property Model +//! +//! A property is a named check over a run. It observes generated interactions, +//! target observations, target-visible state, oracle models, and final +//! outcomes. Failures should include a stable property name and enough context +//! to replay the seed or trace. + +mod rules; +mod runtime; + +use std::ops::Bound; + +use spacetimedb_sats::AlgebraicValue; + +use crate::{ + client::SessionId, + schema::{SchemaPlan, SimRow}, + workload::table_ops::{TableErrorKind, TableWorkloadInteraction, TableWorkloadOutcome}, +}; + +pub(crate) use runtime::PropertyRuntime; + +/// Target adapter for property evaluation.
+pub(crate) trait TargetPropertyAccess { + fn schema_plan(&self) -> &SchemaPlan; + fn lookup_in_connection(&self, conn: SessionId, table: usize, id: u64) -> Result, String>; + fn collect_rows_in_connection(&self, conn: SessionId, table: usize) -> Result, String>; + fn collect_rows_for_table(&self, table: usize) -> Result, String>; + fn count_rows(&self, table: usize) -> Result; + fn count_by_col_eq(&self, table: usize, col: u16, value: &AlgebraicValue) -> Result; + fn range_scan( + &self, + table: usize, + cols: &[u16], + lower: Bound, + upper: Bound, + ) -> Result, String>; +} + +/// Canonical property IDs that can be selected by targets. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub(crate) enum PropertyKind { + /// Safety: target execution must not panic. + /// + /// Enforced by the shared streaming runner. + NotCrash, + /// Metamorphic: an inserted row is immediately visible to the inserting session. + InsertSelect, + /// Metamorphic: a deleted row disappears from the deleting session's view. + DeleteSelect, + /// Differential: optimized predicate counts agree with direct row projection. + SelectSelectOptimizer, + /// Metamorphic: boolean partitions preserve total cardinality. + WhereTrueFalseNull, + /// Metamorphic: composite index range scans implement excluded upper bounds correctly. + IndexRangeExcluded, + /// Safety: observed errors match the model-predicted error class. + ErrorMatchesOracle, + /// Safety: model-predicted no-op interactions do not mutate visible state. + NoMutationMatchesModel, + /// Model/oracle: point lookups match the oracle session-visible model. + PointLookupMatchesModel, + /// Model/oracle: predicate counts match the oracle session-visible model. + PredicateCountMatchesModel, + /// Model/oracle: range scans match the oracle session-visible model. + RangeScanMatchesModel, + /// Model/oracle: full scans match the oracle session-visible model. 
+ FullScanMatchesModel, +} + +#[derive(Clone, Debug)] +pub(crate) enum TableMutation { + Inserted { + table: usize, + requested: SimRow, + returned: SimRow, + }, + Deleted { + table: usize, + row: SimRow, + }, +} + +#[derive(Clone, Debug)] +pub(crate) enum TableObservation { + Applied, + Mutated { + conn: SessionId, + mutations: Vec, + in_tx: bool, + }, + ObservedError(TableErrorKind), + PointLookup { + conn: SessionId, + table: usize, + id: u64, + actual: Option, + }, + PredicateCount { + conn: SessionId, + table: usize, + col: u16, + value: AlgebraicValue, + actual: usize, + }, + RangeScan { + conn: SessionId, + table: usize, + cols: Vec, + lower: Bound, + upper: Bound, + actual: Vec, + }, + FullScan { + conn: SessionId, + table: usize, + actual: Vec, + }, + CommitOrRollback, +} + +struct PropertyContext<'a> { + access: &'a dyn TargetPropertyAccess, + models: &'a runtime::PropertyModels, +} + +#[derive(Clone, Debug)] +enum PropertyEvent<'a> { + TableInteractionApplied, + RowInserted { + conn: SessionId, + table: usize, + returned: &'a SimRow, + in_tx: bool, + }, + RowDeleted { + conn: SessionId, + table: usize, + row: &'a SimRow, + in_tx: bool, + }, + ObservedError { + observed: TableErrorKind, + predicted: TableErrorKind, + subject: Option<(SessionId, usize)>, + interaction: &'a TableWorkloadInteraction, + }, + NoMutation { + subject: Option<(SessionId, usize)>, + interaction: &'a TableWorkloadInteraction, + observation: &'a TableObservation, + }, + PointLookup { + conn: SessionId, + table: usize, + id: u64, + actual: &'a Option, + }, + PredicateCount { + conn: SessionId, + table: usize, + col: u16, + value: &'a AlgebraicValue, + actual: usize, + }, + RangeScan { + conn: SessionId, + table: usize, + cols: &'a [u16], + lower: &'a Bound, + upper: &'a Bound, + actual: &'a [SimRow], + }, + FullScan { + conn: SessionId, + table: usize, + actual: &'a [SimRow], + }, + CommitOrRollback, + TableWorkloadFinished(&'a TableWorkloadOutcome), +} diff --git 
a/crates/dst/src/properties/rules.rs b/crates/dst/src/properties/rules.rs new file mode 100644 index 00000000000..9d2552014c2 --- /dev/null +++ b/crates/dst/src/properties/rules.rs @@ -0,0 +1,483 @@ +use std::ops::Bound; + +use spacetimedb_sats::{AlgebraicType, AlgebraicValue}; + +use crate::{ + client::SessionId, + schema::{SchemaPlan, SimRow}, + workload::table_ops::{TableOperation, TableScenario}, +}; + +use super::{PropertyContext, PropertyEvent, PropertyKind, TableMutation, TableObservation}; + +pub(crate) trait PropertyRule { + fn observe(&mut self, ctx: &PropertyContext<'_>, event: PropertyEvent<'_>) -> Result<(), String> { + let _ = ctx; + let _ = event; + Ok(()) + } +} + +pub(super) fn rule_for_kind(kind: PropertyKind) -> Box { + match kind { + PropertyKind::NotCrash => Box::::default(), + PropertyKind::InsertSelect => Box::::default(), + PropertyKind::DeleteSelect => Box::::default(), + PropertyKind::SelectSelectOptimizer => Box::::default(), + PropertyKind::WhereTrueFalseNull => Box::::default(), + PropertyKind::IndexRangeExcluded => Box::::default(), + PropertyKind::ErrorMatchesOracle => Box::::default(), + PropertyKind::NoMutationMatchesModel => Box::::default(), + PropertyKind::PointLookupMatchesModel => Box::::default(), + PropertyKind::PredicateCountMatchesModel => Box::::default(), + PropertyKind::RangeScanMatchesModel => Box::::default(), + PropertyKind::FullScanMatchesModel => Box::::default(), + } +} + +pub(crate) fn oracle_table_state_rule(scenario: S, schema: SchemaPlan) -> Box +where + S: TableScenario + 'static, +{ + Box::new(OracleTableStateRule::new(scenario, schema)) +} + +#[derive(Default)] +struct NotCrashRule; + +impl PropertyRule for NotCrashRule {} + +struct OracleTableStateRule { + scenario: S, + schema: SchemaPlan, +} + +impl OracleTableStateRule { + fn new(scenario: S, schema: SchemaPlan) -> Self { + Self { scenario, schema } + } +} + +impl PropertyRule for OracleTableStateRule { + fn observe(&mut self, ctx: &PropertyContext<'_>, 
event: PropertyEvent<'_>) -> Result<(), String> { + match event { + PropertyEvent::TableWorkloadFinished(outcome) => { + let expected_rows = ctx.models.table().committed_rows(); + if outcome.final_rows != expected_rows { + return Err(format!( + "[OracleTableState] final table state mismatch: expected={expected_rows:?} actual={:?}", + outcome.final_rows + )); + } + self.scenario + .validate_outcome(&self.schema, outcome) + .map_err(|err| format!("[OracleTableState] scenario invariant failed: {err}")) + } + _ => Ok(()), + } + } +} + +#[derive(Default)] +struct InsertSelectRule; + +impl PropertyRule for InsertSelectRule { + fn observe(&mut self, ctx: &PropertyContext<'_>, event: PropertyEvent<'_>) -> Result<(), String> { + let PropertyEvent::RowInserted { + conn, table, returned, .. + } = event + else { + return Ok(()); + }; + let id = returned.id().ok_or_else(|| "row missing id column".to_string())?; + let found = ctx.access.lookup_in_connection(conn, table, id)?; + if found != Some(returned.clone()) { + return Err(format!( + "[PQS::InsertSelect] row not visible after insert on conn={conn}, table={table}, expected={returned:?}, actual={found:?}" + )); + } + Ok(()) + } +} + +#[derive(Default)] +struct DeleteSelectRule; + +impl PropertyRule for DeleteSelectRule { + fn observe(&mut self, ctx: &PropertyContext<'_>, event: PropertyEvent<'_>) -> Result<(), String> { + let PropertyEvent::RowDeleted { conn, table, row, .. } = event else { + return Ok(()); + }; + let id = row.id().ok_or_else(|| "row missing id column".to_string())?; + if ctx.access.lookup_in_connection(conn, table, id)?.is_some() { + return Err(format!( + "[DeleteSelect] row still visible after delete on conn={conn}, table={table}, row={row:?}" + )); + } + Ok(()) + } +} + +fn post_write_check_tables(ctx: &PropertyContext<'_>, event: &PropertyEvent<'_>) -> Option> { + match event { + PropertyEvent::RowInserted { + table, in_tx: false, .. + } + | PropertyEvent::RowDeleted { + table, in_tx: false, .. 
+ } => Some(vec![*table]), + PropertyEvent::CommitOrRollback => Some((0..ctx.access.schema_plan().tables.len()).collect()), + _ => None, + } +} + +#[derive(Default)] +struct NoRecRule; + +impl PropertyRule for NoRecRule { + fn observe(&mut self, ctx: &PropertyContext<'_>, event: PropertyEvent<'_>) -> Result<(), String> { + let Some(tables) = post_write_check_tables(ctx, &event) else { + return Ok(()); + }; + for table in tables { + let table_plan = ctx + .access + .schema_plan() + .tables + .get(table) + .ok_or_else(|| format!("table {table} out of range"))?; + let Some((col_idx, col_ty)) = table_plan + .columns + .iter() + .enumerate() + .skip(1) + .find(|(_, col)| matches!(col.ty, AlgebraicType::Bool | AlgebraicType::U64)) + .map(|(idx, col)| (idx as u16, &col.ty)) + else { + continue; + }; + let scanned_rows = ctx.access.collect_rows_for_table(table)?; + if scanned_rows.is_empty() { + continue; + } + let predicate_value = match col_ty { + AlgebraicType::Bool => AlgebraicValue::Bool(true), + AlgebraicType::U64 => scanned_rows[0].values[col_idx as usize].clone(), + _ => continue, + }; + let where_count = ctx.access.count_by_col_eq(table, col_idx, &predicate_value)?; + let projected_true_count = scanned_rows + .iter() + .filter(|row| row.values[col_idx as usize] == predicate_value) + .count(); + if where_count != projected_true_count { + return Err(format!( + "[NoREC::SelectSelectOptimizer] mismatch on table={table}, col={col_idx}: where_count={where_count}, projected_true={projected_true_count}" + )); + } + } + Ok(()) + } +} + +#[derive(Default)] +struct TlpRule; + +impl PropertyRule for TlpRule { + fn observe(&mut self, ctx: &PropertyContext<'_>, event: PropertyEvent<'_>) -> Result<(), String> { + let Some(tables) = post_write_check_tables(ctx, &event) else { + return Ok(()); + }; + for table in tables { + let table_plan = ctx + .access + .schema_plan() + .tables + .get(table) + .ok_or_else(|| format!("table {table} out of range"))?; + let Some(col_idx) = 
table_plan + .columns + .iter() + .enumerate() + .skip(1) + .find(|(_, col)| matches!(col.ty, AlgebraicType::Bool)) + .map(|(idx, _)| idx as u16) + else { + continue; + }; + let total = ctx.access.count_rows(table)?; + let true_count = ctx + .access + .count_by_col_eq(table, col_idx, &AlgebraicValue::Bool(true))?; + let false_count = ctx + .access + .count_by_col_eq(table, col_idx, &AlgebraicValue::Bool(false))?; + let partition_sum = true_count + false_count; + if partition_sum != total { + return Err(format!( + "[TLP::WhereTrueFalseNull|TLP::UNIONAllPreservesCardinality] partition mismatch on table={table}, col={col_idx}: true={true_count}, false={false_count}, total={total}" + )); + } + } + Ok(()) + } +} + +#[derive(Default)] +struct IndexRangeExcludedRule; + +impl PropertyRule for IndexRangeExcludedRule { + fn observe(&mut self, ctx: &PropertyContext<'_>, event: PropertyEvent<'_>) -> Result<(), String> { + let Some(tables) = post_write_check_tables(ctx, &event) else { + return Ok(()); + }; + const MAX_ROWS_FOR_INDEX_SCAN_CHECK: usize = 512; + + for table in tables { + let table_plan = ctx + .access + .schema_plan() + .tables + .get(table) + .ok_or_else(|| format!("table {table} out of range"))?; + let rows = ctx.access.collect_rows_for_table(table)?; + if rows.len() < 2 || rows.len() > MAX_ROWS_FOR_INDEX_SCAN_CHECK { + continue; + } + + for cols in table_plan.extra_indexes.iter().filter(|cols| cols.len() > 1) { + if !cols.iter().all(|&col| { + matches!( + table_plan.columns[col as usize].ty, + AlgebraicType::U64 | AlgebraicType::Bool + ) + }) { + continue; + } + + let mut sorted_rows = rows.clone(); + sorted_rows.sort_by(|lhs, rhs| compare_rows_by_cols(lhs, rhs, cols)); + + let lower_key = sorted_rows[0].project_key(cols).to_algebraic_value(); + let upper_key = sorted_rows[sorted_rows.len() - 1] + .project_key(cols) + .to_algebraic_value(); + let lower = Bound::Included(lower_key.clone()); + let upper = Bound::Excluded(upper_key.clone()); + + let mut 
expected_rows = sorted_rows + .into_iter() + .filter(|row| { + let key = row.project_key(cols).to_algebraic_value(); + key >= lower_key && key < upper_key + }) + .collect::>(); + expected_rows.sort_by(|lhs, rhs| compare_rows_by_cols(lhs, rhs, cols)); + + let mut actual_rows = ctx.access.range_scan(table, cols, lower, upper)?; + actual_rows.sort_by(|lhs, rhs| compare_rows_by_cols(lhs, rhs, cols)); + + if actual_rows != expected_rows { + return Err(format!( + "[PQS::IndexRangeExcluded] range mismatch on table={table}, cols={cols:?}: expected={expected_rows:?}, actual={actual_rows:?}" + )); + } + } + } + + Ok(()) + } +} + +#[derive(Default)] +struct ErrorMatchesOracleRule; + +impl PropertyRule for ErrorMatchesOracleRule { + fn observe(&mut self, ctx: &PropertyContext<'_>, event: PropertyEvent<'_>) -> Result<(), String> { + let PropertyEvent::ObservedError { + observed, + predicted, + subject, + interaction, + } = event + else { + return Ok(()); + }; + if observed != predicted { + return Err(format!( + "[ErrorMatchesOracle] observed {observed:?}, but model predicted {predicted:?}: {interaction:?}", + )); + } + if let Some((conn, table)) = subject { + assert_visible_rows_match_model(ctx, conn, table, "[ErrorDoesNotMutate]", interaction)?; + } + Ok(()) + } +} + +#[derive(Default)] +struct NoMutationMatchesModelRule; + +impl PropertyRule for NoMutationMatchesModelRule { + fn observe(&mut self, ctx: &PropertyContext<'_>, event: PropertyEvent<'_>) -> Result<(), String> { + let PropertyEvent::NoMutation { + interaction, + subject, + observation, + } = event + else { + return Ok(()); + }; + if let TableOperation::InsertRows { table, rows, .. } = &interaction.op + && let TableObservation::Mutated { mutations, .. 
} = observation + { + if mutations.len() != rows.len() { + return Err(format!( + "[NoMutationMatchesModel] insert no-op returned wrong mutation count: expected={}, actual={}; interaction={interaction:?}", + rows.len(), + mutations.len() + )); + } + for (row, mutation) in rows.iter().zip(mutations) { + let TableMutation::Inserted { + table: observed_table, + requested, + returned, + } = mutation + else { + return Err(format!( + "[NoMutationMatchesModel] insert no-op returned non-insert mutation: {mutation:?}; interaction={interaction:?}" + )); + }; + if observed_table != table || requested != row || returned != row { + return Err(format!( + "[NoMutationMatchesModel] no-op insert returned row mismatch: expected table={table}, row={row:?}; observed table={observed_table}, requested={requested:?}, returned={returned:?}; interaction={interaction:?}" + )); + } + } + } + + if let Some((conn, table)) = subject { + assert_visible_rows_match_model(ctx, conn, table, "[NoMutationMatchesModel]", interaction)?; + } + Ok(()) + } +} + +fn assert_visible_rows_match_model( + ctx: &PropertyContext<'_>, + conn: SessionId, + table: usize, + property: &str, + interaction: &crate::workload::table_ops::TableWorkloadInteraction, +) -> Result<(), String> { + let mut actual = ctx.access.collect_rows_in_connection(conn, table)?; + actual.sort_by_key(|row| row.id().unwrap_or_default()); + let expected = ctx.models.table().visible_rows(conn, table); + if actual != expected { + return Err(format!( + "{property} visible rows changed unexpectedly on conn={conn}, table={table}: expected={expected:?}, actual={actual:?}; interaction={interaction:?}" + )); + } + Ok(()) +} + +#[derive(Default)] +struct PointLookupMatchesModelRule; + +impl PropertyRule for PointLookupMatchesModelRule { + fn observe(&mut self, ctx: &PropertyContext<'_>, event: PropertyEvent<'_>) -> Result<(), String> { + let PropertyEvent::PointLookup { + conn, + table, + id, + actual, + } = event + else { + return Ok(()); + }; + let 
expected = ctx.models.table().lookup_by_id(conn, table, id); + if *actual != expected { + return Err(format!( + "[Model::PointLookup] mismatch conn={conn}, table={table}, id={id}: expected={expected:?}, actual={actual:?}" + )); + } + Ok(()) + } +} + +#[derive(Default)] +struct PredicateCountMatchesModelRule; + +impl PropertyRule for PredicateCountMatchesModelRule { + fn observe(&mut self, ctx: &PropertyContext<'_>, event: PropertyEvent<'_>) -> Result<(), String> { + let PropertyEvent::PredicateCount { + conn, + table, + col, + value, + actual, + } = event + else { + return Ok(()); + }; + let expected = ctx.models.table().predicate_count(conn, table, col, value); + if actual != expected { + return Err(format!( + "[Model::PredicateCount] mismatch conn={conn}, table={table}, col={col}, value={value:?}: expected={expected}, actual={actual}" + )); + } + Ok(()) + } +} + +#[derive(Default)] +struct RangeScanMatchesModelRule; + +impl PropertyRule for RangeScanMatchesModelRule { + fn observe(&mut self, ctx: &PropertyContext<'_>, event: PropertyEvent<'_>) -> Result<(), String> { + let PropertyEvent::RangeScan { + conn, + table, + cols, + lower, + upper, + actual, + } = event + else { + return Ok(()); + }; + let expected = ctx.models.table().range_scan(conn, table, cols, lower, upper); + if actual != expected.as_slice() { + return Err(format!( + "[Model::RangeScan] mismatch conn={conn}, table={table}, cols={cols:?}, lower={lower:?}, upper={upper:?}: expected={expected:?}, actual={actual:?}" + )); + } + Ok(()) + } +} + +#[derive(Default)] +struct FullScanMatchesModelRule; + +impl PropertyRule for FullScanMatchesModelRule { + fn observe(&mut self, ctx: &PropertyContext<'_>, event: PropertyEvent<'_>) -> Result<(), String> { + let PropertyEvent::FullScan { conn, table, actual } = event else { + return Ok(()); + }; + let expected = ctx.models.table().full_scan(conn, table); + if actual != expected.as_slice() { + return Err(format!( + "[Model::FullScan] mismatch conn={conn}, 
table={table}: expected={expected:?}, actual={actual:?}" + )); + } + Ok(()) + } +} + +fn compare_rows_by_cols(lhs: &SimRow, rhs: &SimRow, cols: &[u16]) -> std::cmp::Ordering { + lhs.project_key(cols) + .to_algebraic_value() + .cmp(&rhs.project_key(cols).to_algebraic_value()) + .then_with(|| lhs.values.cmp(&rhs.values)) +} diff --git a/crates/dst/src/properties/runtime.rs b/crates/dst/src/properties/runtime.rs new file mode 100644 index 00000000000..52951b10b17 --- /dev/null +++ b/crates/dst/src/properties/runtime.rs @@ -0,0 +1,427 @@ +use std::ops::Bound; + +use spacetimedb_sats::AlgebraicValue; + +use crate::{ + client::SessionId, + core::{StreamingProperties, TargetEngine}, + schema::{SchemaPlan, SimRow}, + workload::table_ops::{PredictedOutcome, TableErrorKind, TableOracle, TableWorkloadInteraction, TableWorkloadOutcome}, +}; + +use super::{ + rules::{oracle_table_state_rule, rule_for_kind, PropertyRule}, + PropertyContext, PropertyEvent, PropertyKind, TableMutation, TableObservation, TargetPropertyAccess, +}; + +#[derive(Clone, Debug)] +pub(super) struct PropertyModels { + table: TableModel, +} + +#[derive(Clone, Debug)] +pub(super) struct TableModel { + oracle: TableOracle, +} + +impl PropertyModels { + pub(super) fn new(table_count: usize, num_connections: usize) -> Self { + Self { + table: TableModel { + oracle: TableOracle::new(table_count, num_connections), + }, + } + } + + pub(super) fn table(&self) -> &TableModel { + &self.table + } + + fn predict(&self, interaction: &TableWorkloadInteraction) -> Result { + self.table.oracle.predict(&interaction.op) + } + + fn apply(&mut self, interaction: &TableWorkloadInteraction) { + self.table.oracle.apply(&interaction.op); + } +} + +impl TableModel { + pub(super) fn committed_rows(&self) -> Vec> { + self.oracle.clone().committed_rows() + } + + pub(super) fn lookup_by_id(&self, conn: SessionId, table: usize, id: u64) -> Option { + self.oracle.lookup_by_id(conn, table, id) + } + + pub(super) fn predicate_count(&self, 
conn: SessionId, table: usize, col: u16, value: &AlgebraicValue) -> usize { + self.oracle.predicate_count(conn, table, col, value) + } + + pub(super) fn range_scan( + &self, + conn: SessionId, + table: usize, + cols: &[u16], + lower: &Bound, + upper: &Bound, + ) -> Vec { + self.oracle.range_scan(conn, table, cols, lower, upper) + } + + pub(super) fn full_scan(&self, conn: SessionId, table: usize) -> Vec { + let mut rows = self.oracle.visible_rows(conn, table); + rows.sort_by_key(|row| row.id().unwrap_or_default()); + rows + } + + pub(super) fn visible_rows(&self, conn: SessionId, table: usize) -> Vec { + let mut rows = self.oracle.visible_rows(conn, table); + rows.sort_by_key(|row| row.id().unwrap_or_default()); + rows + } +} + +/// Mutable runtime holding selected property implementations. +pub(crate) struct PropertyRuntime { + rules: Vec, + models: PropertyModels, +} + +impl PropertyRuntime { + pub fn with_kinds(kinds: &[PropertyKind]) -> Self { + let rules = kinds.iter().copied().map(rule_for_kind).map(RuleEntry::new).collect(); + Self { + rules, + models: PropertyModels::new(0, 0), + } + } + + pub fn for_table_workload(scenario: S, schema: SchemaPlan, num_connections: usize) -> Self + where + S: crate::workload::table_ops::TableScenario + 'static, + { + let mut runtime = Self { + models: PropertyModels::new(schema.tables.len(), num_connections), + ..Self::default() + }; + runtime + .rules + .push(RuleEntry::new(oracle_table_state_rule(scenario, schema))); + runtime + } + + fn observe_event(&mut self, access: &dyn TargetPropertyAccess, event: PropertyEvent<'_>) -> Result<(), String> { + let ctx = PropertyContext { + access, + models: &self.models, + }; + for entry in &mut self.rules { + entry.rule.observe(&ctx, event.clone())?; + } + Ok(()) + } + + fn on_table_interaction( + &mut self, + access: &dyn TargetPropertyAccess, + interaction: &TableWorkloadInteraction, + ) -> Result<(), String> { + self.models.apply(interaction); + self.observe_event(access, 
PropertyEvent::TableInteractionApplied) + } + + fn on_mutations( + &mut self, + access: &dyn TargetPropertyAccess, + conn: SessionId, + mutations: &[TableMutation], + in_tx: bool, + ) -> Result<(), String> { + for mutation in mutations { + match mutation { + TableMutation::Inserted { + table, + requested: _, + returned, + } => self.observe_event( + access, + PropertyEvent::RowInserted { + conn, + table: *table, + returned, + in_tx, + }, + )?, + TableMutation::Deleted { table, row } => self.observe_event( + access, + PropertyEvent::RowDeleted { + conn, + table: *table, + row, + in_tx, + }, + )?, + } + } + Ok(()) + } + + fn on_observed_error( + &mut self, + access: &dyn TargetPropertyAccess, + observed: TableErrorKind, + predicted: TableErrorKind, + subject: Option<(SessionId, usize)>, + interaction: &TableWorkloadInteraction, + ) -> Result<(), String> { + self.observe_event( + access, + PropertyEvent::ObservedError { + observed, + predicted, + subject, + interaction, + }, + ) + } + + fn on_no_mutation( + &mut self, + access: &dyn TargetPropertyAccess, + subject: Option<(SessionId, usize)>, + interaction: &TableWorkloadInteraction, + observation: &TableObservation, + ) -> Result<(), String> { + self.observe_event( + access, + PropertyEvent::NoMutation { + subject, + interaction, + observation, + }, + ) + } + + fn on_point_lookup( + &mut self, + access: &dyn TargetPropertyAccess, + conn: SessionId, + table: usize, + id: u64, + actual: &Option, + ) -> Result<(), String> { + self.observe_event( + access, + PropertyEvent::PointLookup { + conn, + table, + id, + actual, + }, + ) + } + + fn on_predicate_count( + &mut self, + access: &dyn TargetPropertyAccess, + conn: SessionId, + table: usize, + col: u16, + value: &AlgebraicValue, + actual: usize, + ) -> Result<(), String> { + self.observe_event( + access, + PropertyEvent::PredicateCount { + conn, + table, + col, + value, + actual, + }, + ) + } + + #[allow(clippy::too_many_arguments)] + fn on_range_scan( + &mut self, + 
access: &dyn TargetPropertyAccess, + conn: SessionId, + table: usize, + cols: &[u16], + lower: &Bound, + upper: &Bound, + actual: &[SimRow], + ) -> Result<(), String> { + self.observe_event( + access, + PropertyEvent::RangeScan { + conn, + table, + cols, + lower, + upper, + actual, + }, + ) + } + + fn on_full_scan( + &mut self, + access: &dyn TargetPropertyAccess, + conn: SessionId, + table: usize, + actual: &[SimRow], + ) -> Result<(), String> { + self.observe_event(access, PropertyEvent::FullScan { conn, table, actual }) + } + + fn on_commit_or_rollback(&mut self, access: &dyn TargetPropertyAccess) -> Result<(), String> { + self.observe_event(access, PropertyEvent::CommitOrRollback) + } + + fn on_table_workload_finish( + &mut self, + access: &dyn TargetPropertyAccess, + outcome: &TableWorkloadOutcome, + ) -> Result<(), String> { + self.observe_event(access, PropertyEvent::TableWorkloadFinished(outcome)) + } + + fn observe_table_observation( + &mut self, + access: &dyn TargetPropertyAccess, + interaction: &TableWorkloadInteraction, + observation: &TableObservation, + ) -> Result<(), String> { + let prediction = self.models.predict(interaction)?; + match (&prediction, observed_error_kind(observation)) { + (PredictedOutcome::Error { kind, subject }, Some(observed)) => { + self.on_observed_error(access, observed, *kind, *subject, interaction)?; + return Ok(()); + } + (PredictedOutcome::Error { kind, .. 
}, None) => { + return Err(format!( + "[ErrorMatchesOracle] expected {kind:?}, observed successful result {observation:?} for {interaction:?}" + )); + } + (PredictedOutcome::Applied, Some(observed)) => { + return Err(format!( + "[ErrorMatchesOracle] expected success, observed {observed:?} for {interaction:?}" + )); + } + (PredictedOutcome::Applied, None) => self.on_table_interaction(access, interaction)?, + (PredictedOutcome::NoMutation { subject: _ }, Some(observed)) => { + return Err(format!( + "[NoMutationMatchesModel] expected no mutation, observed {observed:?} for {interaction:?}" + )); + } + (PredictedOutcome::NoMutation { subject }, None) => { + self.on_no_mutation(access, *subject, interaction, observation)?; + } + } + + match observation { + TableObservation::Applied => {} + TableObservation::Mutated { conn, mutations, in_tx } => { + self.on_mutations(access, *conn, mutations, *in_tx)? + } + TableObservation::ObservedError(_) => {} + TableObservation::PointLookup { + conn, + table, + id, + actual, + } => self.on_point_lookup(access, *conn, *table, *id, actual)?, + TableObservation::PredicateCount { + conn, + table, + col, + value, + actual, + } => self.on_predicate_count(access, *conn, *table, *col, value, *actual)?, + TableObservation::RangeScan { + conn, + table, + cols, + lower, + upper, + actual, + } => self.on_range_scan(access, *conn, *table, cols, lower, upper, actual)?, + TableObservation::FullScan { conn, table, actual } => self.on_full_scan(access, *conn, *table, actual)?, + TableObservation::CommitOrRollback => {} + } + + if matches!(observation, TableObservation::CommitOrRollback) { + self.on_commit_or_rollback(access)?; + } + Ok(()) + } +} + +impl StreamingProperties for PropertyRuntime +where + E: TargetEngine< + TableWorkloadInteraction, + Observation = TableObservation, + Outcome = TableWorkloadOutcome, + Error = String, + > + TargetPropertyAccess, +{ + fn observe( + &mut self, + engine: &E, + interaction: &TableWorkloadInteraction, + 
observation: &TableObservation, + ) -> Result<(), String> { + self.observe_table_observation(engine, interaction, observation) + } + + fn finish(&mut self, engine: &E, outcome: &TableWorkloadOutcome) -> Result<(), String> { + self.on_table_workload_finish(engine, outcome) + } +} + +struct RuleEntry { + rule: Box, +} + +impl RuleEntry { + fn new(rule: Box) -> Self { + Self { rule } + } +} + +impl Default for PropertyRuntime { + fn default() -> Self { + Self::with_kinds(&[ + PropertyKind::NotCrash, + PropertyKind::InsertSelect, + PropertyKind::DeleteSelect, + PropertyKind::SelectSelectOptimizer, + PropertyKind::WhereTrueFalseNull, + PropertyKind::IndexRangeExcluded, + PropertyKind::ErrorMatchesOracle, + PropertyKind::NoMutationMatchesModel, + PropertyKind::PointLookupMatchesModel, + PropertyKind::PredicateCountMatchesModel, + PropertyKind::RangeScanMatchesModel, + PropertyKind::FullScanMatchesModel, + ]) + } +} + +fn observed_error_kind(observation: &TableObservation) -> Option { + match observation { + TableObservation::ObservedError(kind) => Some(*kind), + TableObservation::Applied + | TableObservation::Mutated { .. } + | TableObservation::PointLookup { .. } + | TableObservation::PredicateCount { .. } + | TableObservation::RangeScan { .. } + | TableObservation::FullScan { .. } + | TableObservation::CommitOrRollback => None, + } +} diff --git a/crates/dst/src/schema.rs b/crates/dst/src/schema.rs new file mode 100644 index 00000000000..fdaaa627954 --- /dev/null +++ b/crates/dst/src/schema.rs @@ -0,0 +1,196 @@ +//! Shared schema and row model used by DST targets. + +use spacetimedb_sats::{AlgebraicType, AlgebraicValue, ProductValue}; + +use crate::sim::Rng; + +/// Generated schema for one simulator case. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct SchemaPlan { + /// User-visible tables installed before the workload starts. + pub tables: Vec, +} + +/// Table definition used by simulators. 
+#[derive(Clone, Debug, Eq, PartialEq)] +pub struct TablePlan { + /// Stable logical table name used in generated interactions and assertions. + pub name: String, + /// Ordered column definitions. Column 0 is treated as the primary id column. + pub columns: Vec, + /// Additional indexed column sets beyond the implicit primary id index. + /// + /// A value like `[1]` means a single-column secondary index on column 1. + /// A value like `[0, 1]` means a composite btree index over columns 0 and 1. + pub extra_indexes: Vec>, +} + +/// Column definition used by simulators. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct ColumnPlan { + /// Column name installed into the target schema. + pub name: String, + /// Algebraic type for generated values in this column. + pub ty: AlgebraicType, +} + +/// Serializable row representation used by generated interactions. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct SimRow { + /// Column values in schema order. + pub values: Vec, +} + +pub fn generate_supported_type(rng: &Rng) -> AlgebraicType { + match rng.index(12) { + 0 => AlgebraicType::Bool, + 1 => AlgebraicType::I8, + 2 => AlgebraicType::U8, + 3 => AlgebraicType::I16, + 4 => AlgebraicType::U16, + 5 => AlgebraicType::I32, + 6 => AlgebraicType::U32, + 7 => AlgebraicType::I64, + 8 => AlgebraicType::U64, + 9 => AlgebraicType::I128, + 10 => AlgebraicType::U128, + _ => AlgebraicType::String, + } +} + +pub fn generate_value_for_type(rng: &Rng, ty: &AlgebraicType, idx: usize) -> AlgebraicValue { + if rng.index(5) == 0 { + return edge_value_for_type(rng, ty, idx); + } + + match ty { + AlgebraicType::Bool => AlgebraicValue::Bool(rng.index(2) == 0), + AlgebraicType::I8 => AlgebraicValue::I8(((rng.next_u64() % 64) as i8) - 32), + AlgebraicType::U8 => AlgebraicValue::U8((rng.next_u64() % u8::MAX as u64) as u8), + AlgebraicType::I16 => AlgebraicValue::I16(((rng.next_u64() % 2048) as i16) - 1024), + AlgebraicType::U16 => AlgebraicValue::U16((rng.next_u64() % u16::MAX as u64) as 
u16), + AlgebraicType::I32 => AlgebraicValue::I32(((rng.next_u64() % 200_000) as i32) - 100_000), + AlgebraicType::U32 => AlgebraicValue::U32((rng.next_u64() % 1_000_000) as u32), + AlgebraicType::I64 => AlgebraicValue::I64(((rng.next_u64() % 2_000_000) as i64) - 1_000_000), + AlgebraicType::U64 => AlgebraicValue::U64((rng.next_u64() % 1000) + idx as u64), + AlgebraicType::I128 => { + let v = ((rng.next_u64() % 2_000_000) as i128) - 1_000_000; + AlgebraicValue::I128(v.into()) + } + AlgebraicType::U128 => { + let v = (rng.next_u64() % 2_000_000) as u128; + AlgebraicValue::U128(v.into()) + } + AlgebraicType::String => AlgebraicValue::String(format!("v{}_{}", idx, rng.next_u64() % 10_000).into()), + other => panic!("unsupported generated column type: {other:?}"), + } +} + +pub fn default_value_for_type(ty: &AlgebraicType) -> AlgebraicValue { + match ty { + AlgebraicType::Bool => AlgebraicValue::Bool(false), + AlgebraicType::I8 => AlgebraicValue::I8(0), + AlgebraicType::U8 => AlgebraicValue::U8(0), + AlgebraicType::I16 => AlgebraicValue::I16(0), + AlgebraicType::U16 => AlgebraicValue::U16(0), + AlgebraicType::I32 => AlgebraicValue::I32(0), + AlgebraicType::U32 => AlgebraicValue::U32(0), + AlgebraicType::I64 => AlgebraicValue::I64(0), + AlgebraicType::U64 => AlgebraicValue::U64(0), + AlgebraicType::I128 => AlgebraicValue::I128(0.into()), + AlgebraicType::U128 => AlgebraicValue::U128(0.into()), + AlgebraicType::String => AlgebraicValue::String("".into()), + other => panic!("unsupported generated column type: {other:?}"), + } +} + +pub fn distinct_value_for_type(ty: &AlgebraicType, current: &AlgebraicValue) -> AlgebraicValue { + let default = default_value_for_type(ty); + if &default != current { + return default; + } + + match ty { + AlgebraicType::Bool => AlgebraicValue::Bool(true), + AlgebraicType::I8 => AlgebraicValue::I8(1), + AlgebraicType::U8 => AlgebraicValue::U8(1), + AlgebraicType::I16 => AlgebraicValue::I16(1), + AlgebraicType::U16 => AlgebraicValue::U16(1), + 
AlgebraicType::I32 => AlgebraicValue::I32(1), + AlgebraicType::U32 => AlgebraicValue::U32(1), + AlgebraicType::I64 => AlgebraicValue::I64(1), + AlgebraicType::U64 => AlgebraicValue::U64(1), + AlgebraicType::I128 => AlgebraicValue::I128(1.into()), + AlgebraicType::U128 => AlgebraicValue::U128(1.into()), + AlgebraicType::String => AlgebraicValue::String("dst_unique_conflict".into()), + other => panic!("unsupported generated column type: {other:?}"), + } +} + +fn edge_value_for_type(rng: &Rng, ty: &AlgebraicType, idx: usize) -> AlgebraicValue { + match ty { + AlgebraicType::Bool => AlgebraicValue::Bool(rng.index(2) == 0), + AlgebraicType::I8 => [i8::MIN, -1, 0, 1, i8::MAX][rng.index(5)].into(), + AlgebraicType::U8 => [0, 1, u8::MAX][rng.index(3)].into(), + AlgebraicType::I16 => [i16::MIN, -1, 0, 1, i16::MAX][rng.index(5)].into(), + AlgebraicType::U16 => [0, 1, u16::MAX][rng.index(3)].into(), + AlgebraicType::I32 => [i32::MIN, -1, 0, 1, i32::MAX][rng.index(5)].into(), + AlgebraicType::U32 => [0, 1, u32::MAX][rng.index(3)].into(), + AlgebraicType::I64 => [i64::MIN, -1, 0, 1, i64::MAX][rng.index(5)].into(), + AlgebraicType::U64 => [0, 1, u64::MAX.saturating_sub(idx as u64)][rng.index(3)].into(), + AlgebraicType::I128 => { + let value = [i128::MIN, -1, 0, 1, i128::MAX][rng.index(5)]; + AlgebraicValue::I128(value.into()) + } + AlgebraicType::U128 => { + let value = [0, 1, u128::MAX][rng.index(3)]; + AlgebraicValue::U128(value.into()) + } + AlgebraicType::String => match rng.index(5) { + 0 => AlgebraicValue::String("".into()), + 1 => AlgebraicValue::String("same".into()), + 2 => AlgebraicValue::String("x".repeat(512).into()), + 3 => AlgebraicValue::String(format!("edge_{}", char::from_u32(0x2603).expect("valid char")).into()), + _ => AlgebraicValue::String(format!("v{idx}_edge").into()), + }, + other => panic!("unsupported generated column type: {other:?}"), + } +} + +impl SimRow { + pub fn to_product_value(&self) -> ProductValue { + 
ProductValue::from_iter(self.values.iter().cloned()) + } + + pub fn to_bsatn(&self) -> anyhow::Result> { + Ok(spacetimedb_sats::bsatn::to_vec(&self.to_product_value())?) + } + + pub fn from_product_value(value: ProductValue) -> Self { + SimRow { + values: value.elements.to_vec(), + } + } + + pub fn project_key(&self, cols: &[u16]) -> Self { + let values = cols + .iter() + .map(|&col| self.values[col as usize].clone()) + .collect::>(); + SimRow { values } + } + + pub fn to_algebraic_value(&self) -> AlgebraicValue { + match self.values.as_slice() { + [value] => value.clone(), + _ => ProductValue::from_iter(self.values.iter().cloned()).into(), + } + } + + pub fn id(&self) -> Option { + match self.values.first() { + Some(AlgebraicValue::U64(value)) => Some(*value), + _ => None, + } + } +} diff --git a/crates/dst/src/sim/commitlog.rs b/crates/dst/src/sim/commitlog.rs new file mode 100644 index 00000000000..7fdd83618fc --- /dev/null +++ b/crates/dst/src/sim/commitlog.rs @@ -0,0 +1,317 @@ +//! Commitlog storage fault-injection support for DST targets. + +use std::{ + fmt, + io::{self, BufRead, Read, Seek, Write}, +}; + +use spacetimedb_commitlog::{ + repo::{ + CompressOnce, CompressionStats, Repo, RepoWithoutLockFile, SegmentLen, SegmentReader, TxOffset, TxOffsetIndex, TxOffsetIndexMut, + }, + segment::{FileLike, Header}, +}; + +use crate::sim::storage_faults::{ + is_injected_fault_text, ShortIoKind, StorageFaultConfig, StorageFaultController, StorageFaultDomain, + StorageFaultKind, StorageFaultSummary, +}; + +pub(crate) type CommitlogFaultConfig = StorageFaultConfig; +pub(crate) type CommitlogFaultSummary = StorageFaultSummary; + +/// Returns true if `text` contains an error created by this fault layer. +pub(crate) fn is_injected_disk_error_text(text: &str) -> bool { + is_injected_fault_text(StorageFaultDomain::Disk, text) +} + +/// DST-only repo wrapper that makes the in-memory commitlog backend behave less like RAM. 
+/// +/// Faults stay within normal file API semantics: calls may take deterministic simulated time, +/// reads/writes may complete partially, and configured calls may return transient I/O errors. +/// The wrapper deliberately avoids corruption or crash-style partial persistence; those need a +/// stronger durability model before we enable them. +#[derive(Clone)] +pub(crate) struct FaultableRepo { + inner: R, + faults: StorageFaultController, +} + +impl FaultableRepo { + pub(crate) fn new(inner: R, config: CommitlogFaultConfig) -> Self { + Self { + inner, + faults: StorageFaultController::new(config, StorageFaultDomain::Disk), + } + } + + pub(crate) fn enable_faults(&self) { + self.faults.enable(); + } + + pub(crate) fn fault_summary(&self) -> CommitlogFaultSummary { + self.faults.summary() + } + + pub(crate) fn with_faults_suspended(&self, f: impl FnOnce() -> T) -> T { + self.faults.with_suspended(f) + } +} + +impl fmt::Display for FaultableRepo { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}+faultable({:?})", self.inner, self.faults.summary().profile) + } +} + +impl Repo for FaultableRepo { + type SegmentWriter = FaultableSegment; + type SegmentReader = FaultableReader; + + fn create_segment(&self, offset: u64, header: Header) -> io::Result { + self.faults.maybe_latency(); + self.faults.maybe_error(StorageFaultKind::NoSpace)?; + self.faults.maybe_error(StorageFaultKind::Open)?; + self.inner + .create_segment(offset, header) + .map(|inner| FaultableSegment::new(inner, self.faults.clone())) + } + + fn open_segment_reader(&self, offset: u64) -> io::Result { + self.faults.maybe_latency(); + self.faults.maybe_error(StorageFaultKind::Open)?; + self.inner + .open_segment_reader(offset) + .map(|inner| FaultableReader::new(inner, self.faults.clone())) + } + + fn open_segment_writer(&self, offset: u64) -> io::Result { + self.faults.maybe_latency(); + self.faults.maybe_error(StorageFaultKind::NoSpace)?; + 
self.faults.maybe_error(StorageFaultKind::Open)?; + self.inner + .open_segment_writer(offset) + .map(|inner| FaultableSegment::new(inner, self.faults.clone())) + } + + fn segment_file_path(&self, offset: u64) -> Option { + self.inner.segment_file_path(offset) + } + + fn remove_segment(&self, offset: u64) -> io::Result<()> { + self.faults.maybe_latency(); + self.faults.maybe_error(StorageFaultKind::NoSpace)?; + self.faults.maybe_error(StorageFaultKind::Metadata)?; + self.inner.remove_segment(offset) + } + + fn compress_segment_with(&self, offset: u64, f: impl CompressOnce) -> io::Result { + self.faults.maybe_latency(); + self.faults.maybe_error(StorageFaultKind::NoSpace)?; + self.faults.maybe_error(StorageFaultKind::Metadata)?; + self.inner.compress_segment_with(offset, f) + } + + fn existing_offsets(&self) -> io::Result> { + self.faults.maybe_latency(); + self.faults.maybe_error(StorageFaultKind::NoSpace)?; + self.faults.maybe_error(StorageFaultKind::Metadata)?; + self.inner.existing_offsets() + } + + fn create_offset_index(&self, offset: TxOffset, cap: u64) -> io::Result { + self.faults.maybe_latency(); + self.faults.maybe_error(StorageFaultKind::NoSpace)?; + self.faults.maybe_error(StorageFaultKind::Metadata)?; + self.inner.create_offset_index(offset, cap) + } + + fn remove_offset_index(&self, offset: TxOffset) -> io::Result<()> { + self.faults.maybe_latency(); + self.faults.maybe_error(StorageFaultKind::NoSpace)?; + self.faults.maybe_error(StorageFaultKind::Metadata)?; + self.inner.remove_offset_index(offset) + } + + fn get_offset_index(&self, offset: TxOffset) -> io::Result { + self.faults.maybe_latency(); + self.faults.maybe_error(StorageFaultKind::NoSpace)?; + self.faults.maybe_error(StorageFaultKind::Metadata)?; + self.inner.get_offset_index(offset) + } +} + +impl RepoWithoutLockFile for FaultableRepo {} + +pub(crate) struct FaultableSegment { + inner: S, + faults: StorageFaultController, +} + +impl FaultableSegment { + fn new(inner: S, faults: 
StorageFaultController) -> Self { + Self { inner, faults } + } +} + +impl Read for FaultableSegment { + fn read(&mut self, buf: &mut [u8]) -> io::Result { + self.faults.maybe_latency(); + self.faults.maybe_error(StorageFaultKind::Read)?; + let len = self.faults.maybe_short_len(buf.len(), ShortIoKind::Read); + self.inner.read(&mut buf[..len]) + } +} + +impl Write for FaultableSegment { + fn write(&mut self, buf: &[u8]) -> io::Result { + self.faults.maybe_latency(); + self.faults.check_pending_error(StorageFaultKind::Write)?; + self.faults.maybe_error(StorageFaultKind::NoSpace)?; + self.faults.maybe_error(StorageFaultKind::Write)?; + let is_partial = self.faults.sample_partial_failure(); + let len = self.faults.maybe_short_len(buf.len(), ShortIoKind::Write); + let n = self.inner.write(&buf[..len])?; + if is_partial && n > 0 { + self.faults.arm_pending_error(); + } + Ok(n) + } + + fn flush(&mut self) -> io::Result<()> { + self.faults.maybe_latency(); + self.faults.check_pending_error(StorageFaultKind::Flush)?; + self.faults.maybe_error(StorageFaultKind::NoSpace)?; + self.faults.maybe_error(StorageFaultKind::Flush)?; + self.inner.flush() + } +} + +impl Seek for FaultableSegment { + fn seek(&mut self, pos: io::SeekFrom) -> io::Result { + self.faults.maybe_latency(); + self.inner.seek(pos) + } +} + +impl SegmentLen for FaultableSegment { + fn segment_len(&mut self) -> io::Result { + self.faults.maybe_latency(); + self.faults.maybe_error(StorageFaultKind::Metadata)?; + self.inner.segment_len() + } +} + +impl FileLike for FaultableSegment { + fn fsync(&mut self) -> io::Result<()> { + self.faults.maybe_latency(); + self.faults.check_pending_error(StorageFaultKind::Fsync)?; + self.faults.maybe_error(StorageFaultKind::NoSpace)?; + self.faults.maybe_error(StorageFaultKind::Fsync)?; + self.inner.fsync() + } + + fn ftruncate(&mut self, tx_offset: u64, size: u64) -> io::Result<()> { + self.faults.maybe_latency(); + self.faults.check_pending_error(StorageFaultKind::Metadata)?; + 
self.faults.maybe_error(StorageFaultKind::NoSpace)?; + self.faults.maybe_error(StorageFaultKind::Metadata)?; + self.inner.ftruncate(tx_offset, size) + } +} + +pub(crate) struct FaultableReader { + inner: S, + faults: StorageFaultController, +} + +impl FaultableReader { + fn new(inner: S, faults: StorageFaultController) -> Self { + Self { inner, faults } + } +} + +impl Read for FaultableReader { + fn read(&mut self, buf: &mut [u8]) -> io::Result { + self.faults.maybe_latency(); + self.faults.maybe_error(StorageFaultKind::Read)?; + let len = self.faults.maybe_short_len(buf.len(), ShortIoKind::Read); + self.inner.read(&mut buf[..len]) + } +} + +impl BufRead for FaultableReader { + fn fill_buf(&mut self) -> io::Result<&[u8]> { + self.faults.maybe_latency(); + self.faults.maybe_error(StorageFaultKind::Read)?; + let buf = self.inner.fill_buf()?; + let len = self.faults.maybe_short_len(buf.len(), ShortIoKind::Read); + Ok(&buf[..len]) + } + + fn consume(&mut self, amount: usize) { + self.inner.consume(amount); + } +} + +impl Seek for FaultableReader { + fn seek(&mut self, pos: io::SeekFrom) -> io::Result { + self.faults.maybe_latency(); + self.inner.seek(pos) + } +} + +impl SegmentLen for FaultableReader { + fn segment_len(&mut self) -> io::Result { + self.faults.maybe_latency(); + self.faults.maybe_error(StorageFaultKind::Metadata)?; + self.inner.segment_len() + } +} + +impl SegmentReader for FaultableReader { + fn sealed(&self) -> bool { + self.inner.sealed() + } +} + +#[cfg(test)] +mod tests { + use std::io::{BufRead, Cursor}; + + use crate::{config::CommitlogFaultProfile, sim}; + + use super::*; + + fn always_short_read_config() -> CommitlogFaultConfig { + CommitlogFaultConfig { + profile: CommitlogFaultProfile::Default, + latency_prob: 0.0, + long_latency_prob: 0.0, + short_io_prob: 1.0, + read_error_prob: 0.0, + write_error_prob: 0.0, + flush_error_prob: 0.0, + fsync_error_prob: 0.0, + open_error_prob: 0.0, + metadata_error_prob: 0.0, + max_short_io_divisor: 2, + 
no_space_prob: 0.0, + partial_failure_prob: 0.0, + } + } + + #[test] + fn buf_read_path_applies_short_read_faults() { + let mut runtime = sim::Runtime::new(55).unwrap(); + let handle = runtime.handle(); + handle.enable_buggify(); + runtime.block_on(async { + let faults = StorageFaultController::new(always_short_read_config(), StorageFaultDomain::Disk); + let mut reader = FaultableReader::new(Cursor::new(vec![1, 2, 3, 4]), faults.clone()); + + assert_eq!(reader.fill_buf().unwrap(), &[1, 2]); + assert_eq!(faults.summary().short_read, 1); + }); + } +} diff --git a/crates/dst/src/sim/mod.rs b/crates/dst/src/sim/mod.rs new file mode 100644 index 00000000000..51cea430fc6 --- /dev/null +++ b/crates/dst/src/sim/mod.rs @@ -0,0 +1,118 @@ +//! Local simulation shim for the DST crate. +//! +//! This module is deliberately small, but its executor shape follows madsim's: +//! futures are scheduled as runnable tasks and the ready queue is sampled by a +//! deterministic RNG instead of being driven by a package-level async runtime. + +pub(crate) mod commitlog; +pub(crate) mod snapshot; +pub(crate) mod storage_faults; + +use std::{cell::RefCell, future::Future, time::Duration}; + +pub use spacetimedb_runtime::sim::{yield_now, Handle, JoinHandle, Node, NodeBuilder, NodeId, Rng}; + +thread_local! 
{ + static CURRENT_HANDLE: RefCell> = const { RefCell::new(None) }; +} + +struct CurrentHandleGuard { + previous: Option, +} + +fn enter_current_handle(handle: Handle) -> CurrentHandleGuard { + let previous = CURRENT_HANDLE.with(|slot| slot.replace(Some(handle))); + CurrentHandleGuard { previous } +} + +impl Drop for CurrentHandleGuard { + fn drop(&mut self) { + CURRENT_HANDLE.with(|slot| { + let _ = slot.replace(self.previous.take()); + }); + } +} + +pub(crate) fn current_handle() -> Option { + CURRENT_HANDLE.with(|slot| slot.borrow().clone()) +} + +const GAMMA: u64 = 0x9e37_79b9_7f4a_7c15; + +fn splitmix64(mut x: u64) -> u64 { + x = x.wrapping_add(GAMMA); + x = (x ^ (x >> 30)).wrapping_mul(0xbf58_476d_1ce4_e5b9); + x = (x ^ (x >> 27)).wrapping_mul(0x94d0_49bb_1331_11eb); + x ^ (x >> 31) +} + +pub(crate) fn fork_seed(seed: u64, discriminator: u64) -> u64 { + splitmix64(seed ^ discriminator.wrapping_mul(GAMMA)) +} + +/// DST-facing wrapper that keeps the top-level seed type local to this crate. 
+pub struct Runtime { + inner: spacetimedb_runtime::sim::Runtime, +} + +impl Runtime { + pub fn new(seed: u64) -> anyhow::Result { + Ok(Self { + inner: spacetimedb_runtime::sim::Runtime::new(seed), + }) + } + + pub fn block_on(&mut self, future: F) -> F::Output { + let _guard = enter_current_handle(self.inner.handle()); + spacetimedb_runtime::sim_std::block_on(&mut self.inner, future) + } + + pub fn elapsed(&self) -> Duration { + self.inner.elapsed() + } + + pub fn handle(&self) -> Handle { + self.inner.handle() + } + + pub fn create_node(&self) -> NodeBuilder { + self.inner.create_node() + } + + pub fn pause(&self, node: NodeId) { + self.inner.pause(node); + } + + pub fn resume(&self, node: NodeId) { + self.inner.resume(node); + } + + pub fn spawn_on(&self, node: NodeId, future: F) -> JoinHandle + where + F: Future + Send + 'static, + F::Output: Send + 'static, + { + self.inner.spawn_on(node, future) + } + + pub fn check_determinism(seed: u64, make_future: fn() -> F) -> F::Output + where + F: Future + 'static, + F::Output: Send + 'static, + { + spacetimedb_runtime::sim_std::check_determinism(seed, make_future) + } + + pub fn check_determinism_with(seed: u64, make_future: M) -> F::Output + where + M: Fn() -> F + Clone + Send + 'static, + F: Future + 'static, + F::Output: Send + 'static, + { + spacetimedb_runtime::sim_std::check_determinism(seed, make_future) + } +} +#[allow(dead_code)] +pub(crate) fn decision_source(seed: u64) -> Rng { + Rng::new(seed) +} diff --git a/crates/dst/src/sim/snapshot.rs b/crates/dst/src/sim/snapshot.rs new file mode 100644 index 00000000000..13c0e3a43c3 --- /dev/null +++ b/crates/dst/src/sim/snapshot.rs @@ -0,0 +1,287 @@ +//! In-memory snapshot storage with deterministic fault injection. +//! +//! This is intentionally a semantic snapshot seam, not a filesystem facade. It +//! keeps DST snapshot bytes inside controlled memory storage, while still using +//! the same snapshot capture/restore shape as production. 
+ +use std::{ops::Range, sync::Arc}; + +use spacetimedb_durability::TxOffset; +use spacetimedb_lib::Identity; +use spacetimedb_snapshot::{ + BoxedPendingSnapshot, CompressionStats, MemorySnapshotRepository, PendingSnapshot, ReconstructedSnapshot, + SnapshotError, SnapshotRepo, SnapshotStore, +}; +use spacetimedb_table::{blob_store::BlobStore, page_pool::PagePool, table::Table}; + +use crate::sim::storage_faults::{ + is_injected_fault_text, StorageFaultConfig, StorageFaultController, StorageFaultDomain, StorageFaultKind, + StorageFaultSummary, +}; + +pub(crate) type SnapshotFaultConfig = StorageFaultConfig; + +/// Returns true if `text` contains an error created by this snapshot fault layer. +pub(crate) fn is_injected_snapshot_error_text(text: &str) -> bool { + is_injected_fault_text(StorageFaultDomain::Snapshot, text) +} + +pub(crate) struct SnapshotRestoreRepo { + pub(crate) store: Option>, + pub(crate) restored_snapshot_offset: Option, + pub(crate) latest_snapshot_offset: Option, +} + +/// In-memory snapshot repository wrapped with deterministic operation-level faults. +/// +/// The bytes/pages are written and read by `spacetimedb-snapshot`; this wrapper +/// only decides whether a DST operation reaches that repository. That keeps +/// restore semantics aligned with production without requiring the +/// Tokio-backed `SnapshotWorker` or the host filesystem inside the simulator. +/// +/// This is the intended boundary for the current DST target. It exercises +/// capture/restore behavior, retry classification, and replay correctness. It +/// does not model torn snapshot pages or byte-level corruption. 
+#[derive(Clone)] +pub(crate) struct BuggifiedSnapshotRepo { + repo: Arc, + faults: StorageFaultController, +} + +impl BuggifiedSnapshotRepo { + pub(crate) fn new(config: SnapshotFaultConfig) -> anyhow::Result { + Ok(Self { + repo: Arc::new(MemorySnapshotRepository::new(Identity::ZERO, 0)), + faults: StorageFaultController::new(config, StorageFaultDomain::Snapshot), + }) + } + + pub(crate) fn enable_faults(&self) { + self.faults.enable(); + } + + pub(crate) fn fault_summary(&self) -> StorageFaultSummary { + self.faults.summary() + } + + pub(crate) fn with_faults_suspended(&self, f: impl FnOnce() -> T) -> T { + self.faults.with_suspended(f) + } + + pub(crate) fn latest_snapshot_unfaulted(&self) -> Result, String> { + self.with_faults_suspended(|| { + self.repo + .latest_snapshot() + .map_err(|err| format!("snapshot metadata read failed: {err}")) + }) + } + + pub(crate) fn repo_for_restore(&self, durable_offset: Option) -> Result { + let latest_snapshot_offset = self.latest_snapshot_unfaulted()?; + self.faults.maybe_latency(); + self.inject(StorageFaultKind::Metadata)?; + let Some(durable_offset) = durable_offset else { + return Ok(SnapshotRestoreRepo { + store: None, + restored_snapshot_offset: None, + latest_snapshot_offset, + }); + }; + let restored_snapshot_offset = self + .repo + .latest_snapshot_older_than(durable_offset) + .map_err(|err| format!("snapshot metadata before restore failed: {err}"))?; + if restored_snapshot_offset.is_none() { + return Ok(SnapshotRestoreRepo { + store: None, + restored_snapshot_offset, + latest_snapshot_offset, + }); + } + + self.inject(StorageFaultKind::Open)?; + self.inject(StorageFaultKind::Read)?; + Ok(SnapshotRestoreRepo { + store: Some(self.repo.clone()), + restored_snapshot_offset, + latest_snapshot_offset, + }) + } + + fn inject(&self, kind: StorageFaultKind) -> Result<(), String> { + self.faults.maybe_error(kind).map_err(|err| err.to_string()) + } +} + +impl SnapshotStore for BuggifiedSnapshotRepo { + fn 
database_identity(&self) -> Identity { + self.repo.database_identity() + } + + fn capture_snapshot<'db>( + &self, + tables: &mut dyn Iterator, + blobs: &'db dyn BlobStore, + tx_offset: TxOffset, + ) -> Result { + self.faults.maybe_latency(); + self.faults + .maybe_error(StorageFaultKind::NoSpace) + .map_err(SnapshotError::Io)?; + self.faults + .maybe_error(StorageFaultKind::Open) + .map_err(SnapshotError::Io)?; + self.faults + .maybe_error(StorageFaultKind::Metadata) + .map_err(SnapshotError::Io)?; + self.faults + .maybe_error(StorageFaultKind::Write) + .map_err(SnapshotError::Io)?; + self.faults + .maybe_error(StorageFaultKind::Fsync) + .map_err(SnapshotError::Io)?; + self.repo.capture_snapshot(tables, blobs, tx_offset) + } + + fn read_snapshot(&self, tx_offset: TxOffset, page_pool: &PagePool) -> Result { + self.faults.maybe_latency(); + self.faults + .maybe_error(StorageFaultKind::Open) + .map_err(SnapshotError::Io)?; + self.faults + .maybe_error(StorageFaultKind::Read) + .map_err(SnapshotError::Io)?; + self.repo.read_snapshot(tx_offset, page_pool) + } + + fn latest_snapshot_older_than(&self, upper_bound: TxOffset) -> Result, SnapshotError> { + self.faults.maybe_latency(); + self.faults + .maybe_error(StorageFaultKind::NoSpace) + .map_err(SnapshotError::Io)?; + self.faults + .maybe_error(StorageFaultKind::Metadata) + .map_err(SnapshotError::Io)?; + self.repo.latest_snapshot_older_than(upper_bound) + } + + fn latest_snapshot(&self) -> Result, SnapshotError> { + self.faults.maybe_latency(); + self.faults + .maybe_error(StorageFaultKind::NoSpace) + .map_err(SnapshotError::Io)?; + self.faults + .maybe_error(StorageFaultKind::Metadata) + .map_err(SnapshotError::Io)?; + self.repo.latest_snapshot() + } + + fn invalidate_newer_snapshots(&self, upper_bound: TxOffset) -> Result<(), SnapshotError> { + self.faults.maybe_latency(); + self.faults + .maybe_error(StorageFaultKind::NoSpace) + .map_err(SnapshotError::Io)?; + self.faults + .maybe_error(StorageFaultKind::Metadata) + 
.map_err(SnapshotError::Io)?; + self.repo.invalidate_newer_snapshots(upper_bound) + } + + fn invalidate_snapshot(&self, tx_offset: TxOffset) -> Result<(), SnapshotError> { + self.faults.maybe_latency(); + self.faults + .maybe_error(StorageFaultKind::NoSpace) + .map_err(SnapshotError::Io)?; + self.faults + .maybe_error(StorageFaultKind::Metadata) + .map_err(SnapshotError::Io)?; + self.repo.invalidate_snapshot(tx_offset) + } +} + +struct BuggifiedPendingSnapshot { + tx_offset: TxOffset, +} + +impl PendingSnapshot for BuggifiedPendingSnapshot { + fn sync_all(self: Box) -> Result { + Ok(self.tx_offset) + } +} + +impl SnapshotRepo for BuggifiedSnapshotRepo { + type Pending = BoxedPendingSnapshot; + + fn create_snapshot<'db>( + &self, + tables: &mut dyn Iterator, + blobs: &'db dyn BlobStore, + tx_offset: TxOffset, + ) -> Result { + self.capture_snapshot(tables, blobs, tx_offset)?; + Ok(Box::new(BuggifiedPendingSnapshot { tx_offset })) + } + + fn compress_snapshots(&self, _stats: &mut CompressionStats, _range: Range) -> Result<(), SnapshotError> { + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use crate::{config::CommitlogFaultProfile, sim}; + + use super::*; + + fn no_faults() -> SnapshotFaultConfig { + SnapshotFaultConfig::for_profile(CommitlogFaultProfile::Off) + } + + fn always_metadata_error() -> SnapshotFaultConfig { + SnapshotFaultConfig { + metadata_error_prob: 1.0, + ..SnapshotFaultConfig::for_profile(CommitlogFaultProfile::Default) + } + } + + #[test] + fn repo_without_snapshots_is_not_used_for_restore() { + let mut runtime = sim::Runtime::new(42).unwrap(); + runtime.block_on(async { + let repo = BuggifiedSnapshotRepo::new(no_faults()).unwrap(); + + assert!(repo.repo_for_restore(Some(0)).unwrap().store.is_none()); + }) + } + + #[test] + fn injected_metadata_error_is_counted_and_recognizable() { + let mut runtime = sim::Runtime::new(42).unwrap(); + runtime.block_on(async { + let repo = BuggifiedSnapshotRepo::new(always_metadata_error()).unwrap(); + 
repo.enable_faults(); + + let err = match repo.repo_for_restore(Some(0)) { + Ok(_) => panic!("expected injected snapshot metadata error"), + Err(err) => err, + }; + + assert!(is_injected_snapshot_error_text(&err)); + assert_eq!(repo.fault_summary().metadata_error, 1); + }) + } + + #[test] + fn suspended_faults_allow_restore_probe() { + let mut runtime = sim::Runtime::new(42).unwrap(); + runtime.block_on(async { + let repo = BuggifiedSnapshotRepo::new(always_metadata_error()).unwrap(); + repo.enable_faults(); + + let restore = repo.with_faults_suspended(|| repo.repo_for_restore(Some(0))); + + assert!(restore.unwrap().store.is_none()); + assert_eq!(repo.fault_summary().metadata_error, 0); + }) + } +} diff --git a/crates/dst/src/sim/storage_faults.rs b/crates/dst/src/sim/storage_faults.rs new file mode 100644 index 00000000000..a1c59e5ca71 --- /dev/null +++ b/crates/dst/src/sim/storage_faults.rs @@ -0,0 +1,372 @@ +//! Shared storage fault-injection primitives for DST simulation helpers. +//! +//! Fault decisions use [`spacetimedb_runtime::sim::Handle::buggify_with_prob`] +//! so they are gated by the runtime's centralized buggify flag. + +use std::{ + io, + sync::{ + atomic::{AtomicBool, AtomicU64, AtomicUsize, Ordering}, + Arc, + }, + time::Duration, +}; + +use crate::config::CommitlogFaultProfile; + +const INJECTED_ERROR_PREFIX: &str = "dst injected "; + +pub(crate) fn is_injected_fault_text(domain: StorageFaultDomain, text: &str) -> bool { + text.contains(&format!("{INJECTED_ERROR_PREFIX}{} ", domain.label())) +} + +/// API-level storage fault profile for DST-only storage wrappers. 
+#[derive(Clone, Copy, Debug)] +pub(crate) struct StorageFaultConfig { + pub(crate) profile: CommitlogFaultProfile, + pub(crate) latency_prob: f64, + pub(crate) long_latency_prob: f64, + pub(crate) short_io_prob: f64, + pub(crate) read_error_prob: f64, + pub(crate) write_error_prob: f64, + pub(crate) flush_error_prob: f64, + pub(crate) fsync_error_prob: f64, + pub(crate) open_error_prob: f64, + pub(crate) metadata_error_prob: f64, + pub(crate) max_short_io_divisor: usize, + pub(crate) no_space_prob: f64, + pub(crate) partial_failure_prob: f64, +} + +impl StorageFaultConfig { + pub(crate) fn for_profile(profile: CommitlogFaultProfile) -> Self { + match profile { + CommitlogFaultProfile::Off => Self { + profile, + latency_prob: 0.0, + long_latency_prob: 0.0, + short_io_prob: 0.0, + read_error_prob: 0.0, + write_error_prob: 0.0, + flush_error_prob: 0.0, + fsync_error_prob: 0.0, + open_error_prob: 0.0, + metadata_error_prob: 0.0, + max_short_io_divisor: 2, + no_space_prob: 0.0, + partial_failure_prob: 0.0, + }, + // Realistic rare faults: ~1 in 1000 latency, ~1 in 10000 short I/O / errors. + CommitlogFaultProfile::Light => Self { + profile, + latency_prob: 0.001, + long_latency_prob: 0.0001, + short_io_prob: 0.0001, + read_error_prob: 0.0001, + write_error_prob: 0.0001, + flush_error_prob: 0.0001, + fsync_error_prob: 0.0001, + open_error_prob: 0.0001, + metadata_error_prob: 0.0001, + max_short_io_divisor: 2, + no_space_prob: 0.0001, + partial_failure_prob: 0.0001, + }, + // Moderate rare faults: ~1 in 500 latency, ~1 in 5000 short I/O / errors. 
+ CommitlogFaultProfile::Default => Self { + profile, + latency_prob: 0.002, + long_latency_prob: 0.0002, + short_io_prob: 0.0002, + read_error_prob: 0.0002, + write_error_prob: 0.0002, + flush_error_prob: 0.0002, + fsync_error_prob: 0.0002, + open_error_prob: 0.0002, + metadata_error_prob: 0.0002, + max_short_io_divisor: 2, + no_space_prob: 0.0002, + partial_failure_prob: 0.0002, + }, + // Stress test: ~1 in 10 operations see a fault. + CommitlogFaultProfile::Aggressive => Self { + profile, + latency_prob: 0.10, + long_latency_prob: 0.02, + short_io_prob: 0.02, + read_error_prob: 0.01, + write_error_prob: 0.01, + flush_error_prob: 0.01, + fsync_error_prob: 0.01, + open_error_prob: 0.01, + metadata_error_prob: 0.01, + max_short_io_divisor: 2, + no_space_prob: 0.01, + partial_failure_prob: 0.01, + }, + } + } +} + +#[derive(Clone, Debug, Default, Eq, PartialEq)] +pub(crate) struct StorageFaultSummary { + pub(crate) profile: CommitlogFaultProfile, + pub(crate) latency: usize, + pub(crate) short_read: usize, + pub(crate) short_write: usize, + pub(crate) read_error: usize, + pub(crate) write_error: usize, + pub(crate) flush_error: usize, + pub(crate) fsync_error: usize, + pub(crate) open_error: usize, + pub(crate) metadata_error: usize, + pub(crate) no_space: usize, + pub(crate) partial_failure: usize, +} + +#[derive(Clone, Copy, Debug)] +pub(crate) enum StorageFaultDomain { + Disk, + Snapshot, +} + +impl StorageFaultDomain { + fn label(self) -> &'static str { + match self { + Self::Disk => "disk", + Self::Snapshot => "snapshot", + } + } +} + +#[derive(Clone)] +pub(crate) struct StorageFaultController { + config: StorageFaultConfig, + domain: StorageFaultDomain, + counters: Arc, + handle: Option, + suspended: Arc, +} + +impl StorageFaultController { + pub(crate) fn new(config: StorageFaultConfig, domain: StorageFaultDomain) -> Self { + Self { + config, + domain, + counters: Arc::default(), + handle: crate::sim::current_handle(), + suspended: 
Arc::new(AtomicUsize::new(0)), + } + } + + pub(crate) fn enable(&self) { + if let Some(handle) = &self.handle { + handle.enable_buggify(); + } + } + + pub(crate) fn with_suspended(&self, f: impl FnOnce() -> T) -> T { + self.suspended.fetch_add(1, Ordering::Relaxed); + let _guard = SuspendFaultsGuard { + suspended: self.suspended.clone(), + }; + f() + } + + pub(crate) fn maybe_latency(&self) { + if self.sample_latency(self.config.latency_prob) { + self.counters.latency.fetch_add(1, Ordering::Relaxed); + let latency = if self.sample_latency(self.config.long_latency_prob) { + Duration::from_millis(25) + } else { + Duration::from_millis(1) + }; + if let Some(handle) = &self.handle { + handle.advance(latency); + } + } + } + + pub(crate) fn maybe_error(&self, kind: StorageFaultKind) -> io::Result<()> { + let prob = kind.probability(&self.config); + if self.sample(prob) { + kind.counter(&self.counters).fetch_add(1, Ordering::Relaxed); + return Err(io::Error::new(kind.error_kind(), kind.message(self.domain))); + } + Ok(()) + } + + pub(crate) fn check_pending_error(&self, kind: StorageFaultKind) -> io::Result<()> { + if self.counters.pending_error.swap(false, Ordering::Relaxed) { + kind.counter(&self.counters).fetch_add(1, Ordering::Relaxed); + self.counters.partial_failure.fetch_add(1, Ordering::Relaxed); + return Err(io::Error::new(kind.error_kind(), kind.message(self.domain))); + } + Ok(()) + } + + pub(crate) fn arm_pending_error(&self) { + self.counters.pending_error.store(true, Ordering::Relaxed); + } + + pub(crate) fn sample_partial_failure(&self) -> bool { + if !self.active() || self.config.partial_failure_prob <= 0.0 { + return false; + } + match &self.handle { + Some(handle) => handle.buggify_with_prob(self.config.partial_failure_prob), + None => false, + } + } + + pub(crate) fn maybe_short_len(&self, len: usize, kind: ShortIoKind) -> usize { + if len <= 1 { + return len; + } + if !self.sample(self.config.short_io_prob) { + return len; + } + 
kind.counter(&self.counters).fetch_add(1, Ordering::Relaxed); + let divisor = self.config.max_short_io_divisor.max(2); + (len / divisor).max(1) + } + + pub(crate) fn summary(&self) -> StorageFaultSummary { + StorageFaultSummary { + profile: self.config.profile, + latency: self.counters.latency.load(Ordering::Relaxed) as usize, + short_read: self.counters.short_read.load(Ordering::Relaxed) as usize, + short_write: self.counters.short_write.load(Ordering::Relaxed) as usize, + read_error: self.counters.read_error.load(Ordering::Relaxed) as usize, + write_error: self.counters.write_error.load(Ordering::Relaxed) as usize, + flush_error: self.counters.flush_error.load(Ordering::Relaxed) as usize, + fsync_error: self.counters.fsync_error.load(Ordering::Relaxed) as usize, + open_error: self.counters.open_error.load(Ordering::Relaxed) as usize, + metadata_error: self.counters.metadata_error.load(Ordering::Relaxed) as usize, + no_space: self.counters.no_space.load(Ordering::Relaxed) as usize, + partial_failure: self.counters.partial_failure.load(Ordering::Relaxed) as usize, + } + } + + fn active(&self) -> bool { + self.suspended.load(Ordering::Relaxed) == 0 + } + + fn sample(&self, probability: f64) -> bool { + if probability <= 0.0 || !self.active() { + return false; + } + match &self.handle { + Some(handle) => handle.buggify_with_prob(probability), + None => false, + } + } + + fn sample_latency(&self, probability: f64) -> bool { + if probability <= 0.0 { + return false; + } + match &self.handle { + Some(handle) => handle.buggify_with_prob(probability), + None => false, + } + } +} + +struct SuspendFaultsGuard { + suspended: Arc, +} + +impl Drop for SuspendFaultsGuard { + fn drop(&mut self) { + self.suspended.fetch_sub(1, Ordering::Relaxed); + } +} + +#[derive(Debug, Default)] +struct FaultCounters { + latency: AtomicU64, + short_read: AtomicU64, + short_write: AtomicU64, + read_error: AtomicU64, + write_error: AtomicU64, + flush_error: AtomicU64, + fsync_error: AtomicU64, + 
open_error: AtomicU64, + metadata_error: AtomicU64, + no_space: AtomicU64, + partial_failure: AtomicU64, + pending_error: AtomicBool, +} + +#[derive(Clone, Copy)] +pub(crate) enum ShortIoKind { + Read, + Write, +} + +impl ShortIoKind { + fn counter(self, counters: &FaultCounters) -> &AtomicU64 { + match self { + Self::Read => &counters.short_read, + Self::Write => &counters.short_write, + } + } +} + +#[derive(Clone, Copy)] +pub(crate) enum StorageFaultKind { + Read, + Write, + Flush, + Fsync, + Open, + Metadata, + NoSpace, +} + +impl StorageFaultKind { + fn probability(self, config: &StorageFaultConfig) -> f64 { + match self { + Self::Read => config.read_error_prob, + Self::Write => config.write_error_prob, + Self::Flush => config.flush_error_prob, + Self::Fsync => config.fsync_error_prob, + Self::Open => config.open_error_prob, + Self::Metadata => config.metadata_error_prob, + Self::NoSpace => config.no_space_prob, + } + } + + fn counter(self, counters: &FaultCounters) -> &AtomicU64 { + match self { + Self::Read => &counters.read_error, + Self::Write => &counters.write_error, + Self::Flush => &counters.flush_error, + Self::Fsync => &counters.fsync_error, + Self::Open => &counters.open_error, + Self::Metadata => &counters.metadata_error, + Self::NoSpace => &counters.no_space, + } + } + + fn error_kind(self) -> io::ErrorKind { + match self { + Self::NoSpace => io::ErrorKind::StorageFull, + _ => io::ErrorKind::Other, + } + } + + fn message(self, domain: StorageFaultDomain) -> String { + let label = domain.label(); + match self { + Self::Read => format!("{INJECTED_ERROR_PREFIX}{label} input/output error"), + Self::Write => format!("{INJECTED_ERROR_PREFIX}{label} input/output error"), + Self::Flush => format!("{INJECTED_ERROR_PREFIX}{label} input/output error"), + Self::Fsync => format!("{INJECTED_ERROR_PREFIX}{label} input/output error"), + Self::Open => format!("{INJECTED_ERROR_PREFIX}{label} input/output error"), + Self::Metadata => 
format!("{INJECTED_ERROR_PREFIX}{label} input/output error"), + Self::NoSpace => format!("{INJECTED_ERROR_PREFIX}{label} no space left on device"), + } + } +} diff --git a/crates/dst/src/sim/time.rs b/crates/dst/src/sim/time.rs new file mode 100644 index 00000000000..bdeae0fbb58 --- /dev/null +++ b/crates/dst/src/sim/time.rs @@ -0,0 +1,123 @@ +//! Virtual time for the local DST simulator. + +use std::time::Duration; + +pub use spacetimedb_runtime::sim::time::TimeoutElapsed; +pub use spacetimedb_runtime::sim::Handle as TimeHandle; + +fn current_handle() -> TimeHandle { + super::current_handle().expect("sim::time used outside Runtime::block_on") +} + +pub fn try_current_handle() -> Option { + super::current_handle() +} + +pub fn now() -> Duration { + current_handle().now() +} + +pub async fn sleep(duration: Duration) { + current_handle().sleep(duration).await +} + +pub async fn timeout(duration: Duration, future: impl core::future::Future) -> Result { + current_handle().timeout(duration, future).await +} + +pub fn advance(duration: Duration) { + current_handle().advance(duration); +} + +#[cfg(test)] +mod tests { + use std::{ + sync::{Arc, Mutex}, + time::Duration, + }; + + use crate::sim; + + #[test] + fn sleep_fast_forwards_virtual_time() { + let mut runtime = sim::Runtime::new(101).unwrap(); + + runtime.block_on(async { + assert_eq!(super::now(), Duration::ZERO); + super::sleep(Duration::from_millis(5)).await; + assert_eq!(super::now(), Duration::from_millis(5)); + }); + } + + #[test] + fn shorter_timer_wakes_first() { + let mut runtime = sim::Runtime::new(102).unwrap(); + let handle = runtime.handle(); + let order = Arc::new(Mutex::new(Vec::new())); + + runtime.block_on({ + let order = Arc::clone(&order); + async move { + let slow_order = Arc::clone(&order); + let slow = handle.spawn_on(sim::NodeId::MAIN, async move { + super::sleep(Duration::from_millis(10)).await; + slow_order.lock().expect("order poisoned").push(10); + }); + + let fast_order = 
Arc::clone(&order); + let fast = handle.spawn_on(sim::NodeId::MAIN, async move { + super::sleep(Duration::from_millis(3)).await; + fast_order.lock().expect("order poisoned").push(3); + }); + + fast.await.expect("fast timer task should complete"); + slow.await.expect("slow timer task should complete"); + } + }); + + assert_eq!(*order.lock().expect("order poisoned"), vec![3, 10]); + assert_eq!(runtime.elapsed(), Duration::from_millis(10)); + } + + #[test] + fn explicit_advance_moves_virtual_time() { + let mut runtime = sim::Runtime::new(103).unwrap(); + + runtime.block_on(async { + super::advance(Duration::from_millis(7)); + assert_eq!(super::now(), Duration::from_millis(7)); + }); + } + + #[test] + fn timeout_returns_future_output_before_deadline() { + let mut runtime = sim::Runtime::new(104).unwrap(); + + let output = runtime.block_on(async { + super::timeout(Duration::from_millis(10), async { + super::sleep(Duration::from_millis(3)).await; + 9 + }) + .await + }); + + assert_eq!(output, Ok(9)); + assert_eq!(runtime.elapsed(), Duration::from_millis(3)); + } + + #[test] + fn timeout_expires_at_virtual_deadline() { + let mut runtime = sim::Runtime::new(105).unwrap(); + + let output = runtime.block_on(async { + super::timeout(Duration::from_millis(4), async { + super::sleep(Duration::from_millis(20)).await; + 9 + }) + .await + }); + + assert_eq!(output.unwrap_err().duration(), Duration::from_millis(4)); + assert_eq!(runtime.elapsed(), Duration::from_millis(4)); + } +} diff --git a/crates/dst/src/targets/descriptor.rs b/crates/dst/src/targets/descriptor.rs new file mode 100644 index 00000000000..1a00c77a937 --- /dev/null +++ b/crates/dst/src/targets/descriptor.rs @@ -0,0 +1,40 @@ +//! Target descriptor layer used by the CLI. + +use std::{future::Future, pin::Pin}; + +use crate::{config::RunConfig, workload::table_ops::TableScenarioId}; + +/// Descriptor contract: CLI talks to this, not per-target ad hoc handlers. 
+pub trait TargetDescriptor { + const NAME: &'static str; + type Scenario; + + fn prepare(_seed: u64, _scenario: &Self::Scenario, _config: &RunConfig) -> anyhow::Result<()> { + Ok(()) + } + + fn run_streaming(seed: u64, scenario: Self::Scenario, config: RunConfig) -> TargetRunFuture; +} + +pub type TargetRunFuture = Pin>>>; + +pub struct RelationalDbCommitlogDescriptor; + +impl TargetDescriptor for RelationalDbCommitlogDescriptor { + const NAME: &'static str = "relational-db-commitlog"; + type Scenario = TableScenarioId; + + fn run_streaming(seed: u64, scenario: Self::Scenario, config: RunConfig) -> TargetRunFuture { + Box::pin(async move { + let outcome = + crate::targets::relational_db_commitlog::run_generated_with_config_and_scenario(seed, scenario, config) + .await?; + Ok(format!( + "ok target={} seed={} steps={}", + Self::NAME, + seed, + outcome.final_row_counts.iter().sum::(), + )) + }) + } +} diff --git a/crates/dst/src/targets/mod.rs b/crates/dst/src/targets/mod.rs new file mode 100644 index 00000000000..51a483d73a2 --- /dev/null +++ b/crates/dst/src/targets/mod.rs @@ -0,0 +1,4 @@ +//! Concrete simulation targets. + +pub mod descriptor; +pub mod relational_db_commitlog; diff --git a/crates/dst/src/targets/relational_db_commitlog.rs b/crates/dst/src/targets/relational_db_commitlog.rs new file mode 100644 index 00000000000..5a116a6e3aa --- /dev/null +++ b/crates/dst/src/targets/relational_db_commitlog.rs @@ -0,0 +1,848 @@ +//! Simple RelationalDB DST target — table operations only. 
+ +use std::ops::Bound; +use std::sync::Arc; + +use spacetimedb_commitlog::repo::mem::Memory; +use spacetimedb_core::{ + db::persistence::{DiskSizeFn, Persistence}, + db::relational_db::{MutTx as RelMutTx, RelationalDB, Tx as RelTx}, + error::DBError, + messages::control_db::HostType, +}; +use spacetimedb_datastore::{execution_context::Workload, traits::IsolationLevel}; +use spacetimedb_durability::local::Options as DurabilityOpts; +use spacetimedb_durability::Local as DurabilityLocal; +use spacetimedb_lib::{ + db::auth::{StAccess, StTableType}, + Identity, +}; +use spacetimedb_primitives::TableId; +use spacetimedb_runtime::Handle as RuntimeHandle; +use spacetimedb_sats::AlgebraicValue; +use spacetimedb_schema::{ + def::BTreeAlgorithm, + schema::{ColumnSchema, ConstraintSchema, IndexSchema, TableSchema}, + table_name::TableName, +}; +use spacetimedb_snapshot::SnapshotStore; +use spacetimedb_table::page_pool::PagePool; +use tracing::{info, trace}; + +use crate::{ + client::SessionId, + config::{CommitlogFaultProfile, RunConfig}, + core::{self, TargetEngine}, + properties::{ + PropertyRuntime, TableMutation, TableObservation, TargetPropertyAccess, + }, + schema::{SchemaPlan, SimRow}, + sim::{ + commitlog::{CommitlogFaultConfig, FaultableRepo}, + fork_seed, + snapshot::BuggifiedSnapshotRepo, + storage_faults::StorageFaultConfig, + Rng, + }, + workload::table_ops::{ + ConnectionWriteState, TableErrorKind, TableOperation, TableScenario, TableScenarioId, TableWorkloadInteraction, + TableWorkloadOutcome, TableWorkloadSource, + }, +}; + +pub type RelationalDbTableOutcome = TableWorkloadOutcome; + +pub async fn run_generated_with_config_and_scenario( + seed: u64, + scenario: TableScenarioId, + config: RunConfig, +) -> anyhow::Result { + let num_connections = { + let rng = Rng::new(fork_seed(seed, 121)); + rng.index(3) + 1 + }; + let schema_rng = Rng::new(fork_seed(seed, 122)); + let schema = scenario.generate_schema(&schema_rng); + let source = TableWorkloadSource::new( + 
seed, + scenario, + schema.clone(), + num_connections, + config.max_interactions_or_default(usize::MAX), + ); + + let sim_handle = crate::sim::current_handle().expect("must run inside sim Runtime::block_on"); + let rt_handle = RuntimeHandle::simulation(sim_handle.clone()); + + // Build faulty commitlog + persistence + let clog_repo = FaultableRepo::new( + Memory::unlimited(), + CommitlogFaultConfig::for_profile(CommitlogFaultProfile::Default), + ); + let local = DurabilityLocal::open_with_repo(clog_repo, rt_handle.clone(), DurabilityOpts::default())?; + let history = local.as_history(); + let durability = Arc::new(local); + + // Build faulty snapshot store + let snap_repo = Arc::new(BuggifiedSnapshotRepo::new( + StorageFaultConfig::for_profile(CommitlogFaultProfile::Default), + )?) as Arc; + + // Enable buggify after setup so initial replay is fault-free + sim_handle.enable_buggify(); + + let persistence = Persistence { + durability, + disk_size: { + use std::io; + use spacetimedb_commitlog::repo::SizeOnDisk; + Arc::new(|| io::Result::Ok(SizeOnDisk { total_bytes: 0, total_blocks: 0 })) as DiskSizeFn + }, + snapshot_store: Some(snap_repo), + snapshots: None, + runtime: rt_handle, + }; + + let engine = RelationalDbEngine::new(seed, &schema, num_connections, history, Some(persistence))?; + let properties = PropertyRuntime::for_table_workload(scenario, schema.clone(), num_connections); + let outcome = core::run_streaming(source, engine, properties, config).await?; + info!( + applied_steps = outcome.final_row_counts.iter().sum::(), + "relational_db_table complete" + ); + Ok(outcome) +} + +struct RelationalDbEngine { + db: Option, + execution: ConnectionWriteState, + read_tx_by_connection: Vec>, + base_schema: SchemaPlan, + base_table_ids: Vec, + step: usize, +} + +impl RelationalDbEngine { + fn new>( + _seed: u64, schema: &SchemaPlan, num_connections: usize, + history: H, persistence: Option, + ) -> anyhow::Result { + let (db, connected_clients) = RelationalDB::open( + 
Identity::ZERO, + Identity::ZERO, + history, + persistence, + None, + PagePool::new_for_test(), + )?; + assert_eq!(connected_clients.len(), 0); + db.with_auto_commit(Workload::Internal, |tx| { + db.set_initialized(tx, spacetimedb_datastore::traits::Program::empty(HostType::Wasm.into())) + })?; + + let mut engine = Self { + db: Some(db), + execution: ConnectionWriteState::new(num_connections), + read_tx_by_connection: (0..num_connections).map(|_| None).collect(), + base_schema: schema.clone(), + base_table_ids: Vec::with_capacity(schema.tables.len()), + step: 0, + }; + engine.install_base_schema().map_err(anyhow::Error::msg)?; + Ok(engine) + } + + fn db(&self) -> Result<&RelationalDB, String> { + self.db.as_ref().ok_or_else(|| "relational db not initialized".to_string()) + } + + fn install_base_schema(&mut self) -> Result<(), String> { + let mut tx = self + .db()? + .begin_mut_tx(IsolationLevel::Serializable, Workload::ForTests); + for table in &self.base_schema.tables { + let columns = table + .columns + .iter() + .enumerate() + .map(|(idx, col)| ColumnSchema::for_test(idx as u16, &col.name, col.ty.clone())) + .collect::>(); + let mut indexes = vec![IndexSchema::for_test( + format!("{}_id_idx", table.name), + BTreeAlgorithm::from(0), + )]; + for cols in &table.extra_indexes { + let cols_name = cols.iter().map(|col| format!("c{col}")).collect::>().join("_"); + indexes.push(IndexSchema::for_test( + format!("{}_{}_idx", table.name, cols_name), + BTreeAlgorithm::from(cols.iter().copied().collect::()), + )); + } + let constraints = vec![ConstraintSchema::unique_for_test( + format!("{}_id_unique", table.name), + 0, + )]; + let table_id = self + .db()? 
+ .create_table( + &mut tx, + TableSchema::new( + TableId::SENTINEL, + TableName::for_test(&table.name), + None, + columns, + indexes, + constraints, + vec![], + StTableType::User, + StAccess::Public, + None, + Some(0.into()), + false, + None, + ), + ) + .map_err(|err| format!("create table '{}' failed: {err}", table.name))?; + self.base_table_ids.push(table_id); + } + let _ = self + .db()? + .commit_tx(tx) + .map_err(|err| format!("install base schema commit failed: {err}"))?; + Ok(()) + } + + fn execute(&mut self, interaction: &TableWorkloadInteraction) -> Result { + self.step = self.step.saturating_add(1); + self.execute_table_op(interaction) + } + + fn execute_table_op(&mut self, interaction: &TableWorkloadInteraction) -> Result { + trace!(step = self.step, op = ?interaction.op, "table interaction"); + let observation = self.execute_table_op_inner(&interaction.op)?; + Ok(observation) + } + + fn execute_table_op_inner(&mut self, op: &TableOperation) -> Result { + match op { + TableOperation::BeginTx { conn } => self.begin_write_tx(*conn), + TableOperation::BeginReadTx { conn } => { + self.execution.ensure_known_connection(*conn)?; + if self.execution.tx_by_connection[conn.as_index()].is_some() { + return Err(format!("connection {conn} already has open write transaction")); + } + if self.read_tx_by_connection[conn.as_index()].is_some() { + return Err(format!("connection {conn} already has open read transaction")); + } + let tx = self.db()?.begin_tx(Workload::ForTests); + self.read_tx_by_connection[conn.as_index()] = Some(tx); + Ok(TableObservation::Applied) + } + TableOperation::ReleaseReadTx { conn } => { + self.execution.ensure_known_connection(*conn)?; + let tx = self.read_tx_by_connection[conn.as_index()] + .take() + .ok_or_else(|| format!("connection {conn} has no read transaction to release"))?; + let _ = self.db()?.release_tx(tx); + Ok(TableObservation::Applied) + } + TableOperation::CommitTx { conn } => { + self.execution.ensure_writer_owner(*conn, 
"commit")?; + let tx = self.execution.tx_by_connection[conn.as_index()] + .take() + .ok_or_else(|| format!("connection {conn} has no transaction to commit"))?; + let _ = self + .db()? + .commit_tx(tx) + .map_err(|err| format!("commit interaction failed: {err}"))?; + self.execution.active_writer = None; + Ok(TableObservation::CommitOrRollback) + } + TableOperation::RollbackTx { conn } => { + self.execution.ensure_writer_owner(*conn, "rollback")?; + let tx = self.execution.tx_by_connection[conn.as_index()] + .take() + .ok_or_else(|| format!("connection {conn} has no transaction to rollback"))?; + let _ = self.db()?.rollback_mut_tx(tx); + self.execution.active_writer = None; + Ok(TableObservation::CommitOrRollback) + } + TableOperation::InsertRows { conn, table, rows } => self.execute_insert_rows(*conn, *table, rows), + TableOperation::DeleteRows { conn, table, rows } => self.execute_delete_rows(*conn, *table, rows), + TableOperation::AddColumn { + conn, + table, + column, + default, + } => { + let table_id = self.table_id_for_index(*table)?; + let column_idx = self.base_schema.tables[*table].columns.len() as u16; + let mut columns = self.base_schema.tables[*table] + .columns + .iter() + .enumerate() + .map(|(idx, existing)| ColumnSchema::for_test(idx as u16, &existing.name, existing.ty.clone())) + .collect::>(); + columns.push(ColumnSchema::for_test(column_idx, &column.name, column.ty.clone())); + self.with_mut_tx(*conn, |engine, tx| { + let new_table_id = engine + .db()? 
+ .add_columns_to_table(tx, table_id, columns.clone(), vec![default.clone()]) + .map_err(|err| format!("add column failed: {err}"))?; + Ok(new_table_id) + })?; + Ok(TableObservation::Applied) + } + TableOperation::AddIndex { conn, table, cols } => { + let table_id = self.table_id_for_index(*table)?; + self.with_mut_tx(*conn, |engine, tx| { + let mut schema = IndexSchema::for_test( + format!( + "{}_dst_added_{}_idx", + engine.base_schema.tables[*table].name, + engine.base_schema.tables[*table].extra_indexes.len() + ), + BTreeAlgorithm::from(cols.iter().copied().collect::()), + ); + schema.table_id = table_id; + engine + .db()? + .create_index(tx, schema, false) + .map_err(|err| format!("add index failed: {err}"))?; + Ok(()) + })?; + if !self.base_schema.tables[*table].extra_indexes.contains(cols) { + self.base_schema.tables[*table].extra_indexes.push(cols.clone()); + } + Ok(TableObservation::Applied) + } + TableOperation::PointLookup { conn, table, id } => { + let actual = self.lookup_base_row(*conn, *table, *id)?; + Ok(TableObservation::PointLookup { + conn: *conn, + table: *table, + id: *id, + actual, + }) + } + TableOperation::PredicateCount { + conn, + table, + col, + value, + } => { + let actual = self.count_by_col_eq_in_connection(*conn, *table, *col, value)?; + Ok(TableObservation::PredicateCount { + conn: *conn, + table: *table, + col: *col, + value: value.clone(), + actual, + }) + } + TableOperation::RangeScan { + conn, + table, + cols, + lower, + upper, + } => { + let actual = self.range_scan_in_connection(*conn, *table, cols, lower.clone(), upper.clone())?; + Ok(TableObservation::RangeScan { + conn: *conn, + table: *table, + cols: cols.clone(), + lower: lower.clone(), + upper: upper.clone(), + actual, + }) + } + TableOperation::FullScan { conn, table } => { + let actual = self.collect_rows_in_connection(*conn, *table)?; + Ok(TableObservation::FullScan { + conn: *conn, + table: *table, + actual, + }) + } + } + } + + fn begin_write_tx(&mut self, conn: 
SessionId) -> Result { + self.execution.ensure_known_connection(conn)?; + if self.read_tx_by_connection[conn.as_index()].is_some() { + return Err(format!("connection {conn} already has open read transaction")); + } + if self.execution.tx_by_connection[conn.as_index()].is_some() { + return Err(format!("connection {conn} already has open transaction")); + } + match self + .db()? + .try_begin_mut_tx(IsolationLevel::Serializable, Workload::ForTests) + { + Some(tx) => { + if self.execution.active_writer.is_some() || self.any_open_read_tx() { + let _ = self.db()?.rollback_mut_tx(tx); + return Err(format!( + "connection {conn} unexpectedly acquired write lock while conflicting transaction was open" + )); + } + self.execution.tx_by_connection[conn.as_index()] = Some(tx); + self.execution.active_writer = Some(conn); + Ok(TableObservation::Applied) + } + None => { + if self.execution.active_writer.is_some() || self.any_open_read_tx() { + Ok(TableObservation::ObservedError( + TableErrorKind::WriteConflict, + )) + } else { + Err(format!( + "connection {conn} failed to begin write transaction without an open conflicting lock" + )) + } + } + } + } + + fn execute_insert_rows( + &mut self, + conn: SessionId, + table: usize, + rows: &[SimRow], + ) -> Result { + let in_tx = self.is_in_write_tx(conn); + let outcome = self.with_mut_tx_observed(conn, |engine, tx| { + let mut mutations = Vec::with_capacity(rows.len()); + for row in rows { + match engine.try_insert_base_row(tx, table, row)? 
{ + Ok(returned) => mutations.push(TableMutation::Inserted { + table, + requested: row.clone(), + returned, + }), + Err(err) if is_unique_constraint_violation(&err) => { + return Ok(Err(TableErrorKind::UniqueConstraintViolation)); + } + Err(err) => return Err(format!("insert failed: {err}")), + } + } + Ok(Ok(mutations)) + }); + self.mutation_observation(conn, in_tx, outcome) + } + + fn execute_delete_rows( + &mut self, + conn: SessionId, + table: usize, + rows: &[SimRow], + ) -> Result { + let in_tx = self.is_in_write_tx(conn); + let outcome = self.with_mut_tx_observed(conn, |engine, tx| { + let mut mutations = Vec::with_capacity(rows.len()); + for row in rows { + match engine.delete_base_row_count(tx, table, row)? { + 0 => return Ok(Err(TableErrorKind::MissingRow)), + 1 => mutations.push(TableMutation::Deleted { + table, + row: row.clone(), + }), + deleted => { + return Err(format!("delete for row={row:?} affected {deleted} rows")); + } + } + } + Ok(Ok(mutations)) + }); + self.mutation_observation(conn, in_tx, outcome) + } + + fn mutation_observation( + &mut self, + conn: SessionId, + in_tx: bool, + outcome: Result, TableErrorKind>, String>, + ) -> Result { + match outcome { + Ok(Ok(mutations)) => Ok(TableObservation::Mutated { conn, mutations, in_tx }), + Ok(Err(kind)) => Ok(TableObservation::ObservedError(kind)), + Err(err) if is_write_conflict_error(&err) => { + Ok(TableObservation::ObservedError(TableErrorKind::WriteConflict)) + } + Err(err) => Err(err), + } + } + + fn with_mut_tx_observed( + &mut self, + conn: SessionId, + mut f: impl FnMut(&mut Self, &mut RelMutTx) -> Result, String>, + ) -> Result, String> { + self.execution.ensure_known_connection(conn)?; + if self.read_tx_by_connection[conn.as_index()].is_some() { + return Err(format!("connection {conn} cannot write while read transaction is open")); + } + if self.execution.tx_by_connection[conn.as_index()].is_some() { + let mut tx = self.execution.tx_by_connection[conn.as_index()] + .take() + 
.ok_or_else(|| format!("connection {conn} missing transaction handle"))?; + let result = f(self, &mut tx); + self.execution.tx_by_connection[conn.as_index()] = Some(tx); + return result; + } + + if self.execution.active_writer.is_some() || self.any_open_read_tx() { + return Ok(Err(TableErrorKind::WriteConflict)); + } + + let mut tx = self + .db()? + .try_begin_mut_tx(IsolationLevel::Serializable, Workload::ForTests) + .ok_or_else(|| format!("connection {conn} failed to acquire write transaction"))?; + self.execution.active_writer = Some(conn); + let value = match f(self, &mut tx) { + Ok(Ok(value)) => value, + Ok(Err(kind)) => { + let _ = self.db()?.rollback_mut_tx(tx); + self.execution.active_writer = None; + return Ok(Err(kind)); + } + Err(err) => { + let _ = self.db()?.rollback_mut_tx(tx); + self.execution.active_writer = None; + return Err(err); + } + }; + let _ = self + .db()? + .commit_tx(tx) + .map_err(|err| format!("auto-commit write failed: {err}"))?; + self.execution.active_writer = None; + Ok(Ok(value)) + } + + fn with_mut_tx( + &mut self, + conn: SessionId, + mut f: impl FnMut(&mut Self, &mut RelMutTx) -> Result, + ) -> Result { + self.execution.ensure_known_connection(conn)?; + if self.read_tx_by_connection[conn.as_index()].is_some() { + return Err(format!("connection {conn} cannot write while read transaction is open")); + } + if self.execution.tx_by_connection[conn.as_index()].is_some() { + let mut tx = self.execution.tx_by_connection[conn.as_index()] + .take() + .ok_or_else(|| format!("connection {conn} missing transaction handle"))?; + let result = f(self, &mut tx); + self.execution.tx_by_connection[conn.as_index()] = Some(tx); + return result; + } + + if self.execution.active_writer.is_some() || self.any_open_read_tx() { + return Err(format!( + "connection {conn} cannot auto-commit write while a conflicting lock is open" + )); + } + + let mut tx = self + .db()? 
+ .try_begin_mut_tx(IsolationLevel::Serializable, Workload::ForTests) + .ok_or_else(|| format!("connection {conn} failed to acquire write transaction"))?; + self.execution.active_writer = Some(conn); + let value = match f(self, &mut tx) { + Ok(value) => value, + Err(err) => { + let _ = self.db()?.rollback_mut_tx(tx); + self.execution.active_writer = None; + return Err(err); + } + }; + let _ = self + .db()? + .commit_tx(tx) + .map_err(|err| format!("auto-commit write failed: {err}"))?; + self.execution.active_writer = None; + Ok(value) + } + + fn try_insert_base_row( + &self, + tx: &mut RelMutTx, + table: usize, + row: &SimRow, + ) -> Result, String> { + let table_id = self.table_id_for_index(table)?; + let bsatn = row.to_bsatn().map_err(|err| err.to_string())?; + Ok(match self.db()?.insert(tx, table_id, &bsatn) { + Ok((_, row_ref, _)) => Ok(SimRow::from_product_value(row_ref.to_product_value())), + Err(err) => Err(err), + }) + } + + fn delete_base_row_count(&self, tx: &mut RelMutTx, table: usize, row: &SimRow) -> Result { + let table_id = self.table_id_for_index(table)?; + Ok(self.db()?.delete_by_rel(tx, table_id, [row.to_product_value()])) + } + + fn any_open_read_tx(&self) -> bool { + self.read_tx_by_connection.iter().any(Option::is_some) + } + + fn is_in_write_tx(&self, conn: SessionId) -> bool { + self.execution + .tx_by_connection + .get(conn.as_index()) + .is_some_and(Option::is_some) + } + + fn table_id_for_index(&self, table: usize) -> Result { + self.base_table_ids + .get(table) + .copied() + .ok_or_else(|| format!("table {table} out of range")) + } + + fn with_fresh_read_tx(&self, f: impl FnOnce(&RelationalDB, &RelTx) -> Result) -> Result { + let db = self.db()?; + let tx = db.begin_tx(Workload::ForTests); + let result = f(db, &tx); + let _ = db.release_tx(tx); + result + } + + fn collect_rows_by_id(&self, table_id: TableId) -> Result, String> { + self.with_fresh_read_tx(|db, tx| { + let mut rows = db + .iter(tx, table_id) + .map_err(|err| format!("scan 
failed: {err}"))? + .map(|row_ref| SimRow::from_product_value(row_ref.to_product_value())) + .collect::>(); + rows.sort_by_key(|row| row.id().unwrap_or_default()); + Ok(rows) + }) + } + + fn lookup_base_row(&self, conn: SessionId, table: usize, id: u64) -> Result, String> { + let table_id = self.table_id_for_index(table)?; + if let Some(Some(tx)) = self.execution.tx_by_connection.get(conn.as_index()) { + Ok(self + .db()? + .iter_by_col_eq_mut(tx, table_id, 0u16, &AlgebraicValue::U64(id)) + .map_err(|err| format!("in-tx lookup failed: {err}"))? + .map(|row_ref| SimRow::from_product_value(row_ref.to_product_value())) + .next()) + } else if let Some(Some(tx)) = self.read_tx_by_connection.get(conn.as_index()) { + Ok(self + .db()? + .iter_by_col_eq(tx, table_id, 0u16, &AlgebraicValue::U64(id)) + .map_err(|err| format!("read-tx lookup failed: {err}"))? + .map(|row_ref| SimRow::from_product_value(row_ref.to_product_value())) + .next()) + } else { + self.with_fresh_read_tx(|db, tx| { + Ok(db + .iter_by_col_eq(tx, table_id, 0u16, &AlgebraicValue::U64(id)) + .map_err(|err| format!("lookup failed: {err}"))? + .map(|row_ref| SimRow::from_product_value(row_ref.to_product_value())) + .next()) + }) + } + } + + fn collect_rows_in_connection(&self, conn: SessionId, table: usize) -> Result, String> { + let table_id = self.table_id_for_index(table)?; + if let Some(Some(tx)) = self.execution.tx_by_connection.get(conn.as_index()) { + let mut rows = self + .db()? + .iter_mut(tx, table_id) + .map_err(|err| format!("in-tx scan failed: {err}"))? + .map(|row_ref| SimRow::from_product_value(row_ref.to_product_value())) + .collect::>(); + rows.sort_by_key(|row| row.id().unwrap_or_default()); + Ok(rows) + } else if let Some(Some(tx)) = self.read_tx_by_connection.get(conn.as_index()) { + let mut rows = self + .db()? + .iter(tx, table_id) + .map_err(|err| format!("read-tx scan failed: {err}"))? 
+ .map(|row_ref| SimRow::from_product_value(row_ref.to_product_value())) + .collect::>(); + rows.sort_by_key(|row| row.id().unwrap_or_default()); + Ok(rows) + } else { + self.collect_rows_by_id(table_id) + } + } + + fn count_by_col_eq_in_connection( + &self, + conn: SessionId, + table: usize, + col: u16, + value: &AlgebraicValue, + ) -> Result { + let table_id = self.table_id_for_index(table)?; + if let Some(Some(tx)) = self.execution.tx_by_connection.get(conn.as_index()) { + Ok(self + .db()? + .iter_by_col_eq_mut(tx, table_id, col, value) + .map_err(|err| format!("in-tx predicate query failed: {err}"))? + .count()) + } else if let Some(Some(tx)) = self.read_tx_by_connection.get(conn.as_index()) { + Ok(self + .db()? + .iter_by_col_eq(tx, table_id, col, value) + .map_err(|err| format!("read-tx predicate query failed: {err}"))? + .count()) + } else { + self.with_fresh_read_tx(|db, tx| { + Ok(db + .iter_by_col_eq(tx, table_id, col, value) + .map_err(|err| format!("predicate query failed: {err}"))? + .count()) + }) + } + } + + fn range_scan_in_connection( + &self, + conn: SessionId, + table: usize, + cols: &[u16], + lower: Bound, + upper: Bound, + ) -> Result, String> { + let table_id = self.table_id_for_index(table)?; + let cols_list = cols.iter().copied().collect::(); + if let Some(Some(tx)) = self.execution.tx_by_connection.get(conn.as_index()) { + let mut rows = self + .db()? + .iter_by_col_range_mut(tx, table_id, cols_list, (lower, upper)) + .map_err(|err| format!("in-tx range scan failed: {err}"))? + .map(|row_ref| SimRow::from_product_value(row_ref.to_product_value())) + .collect::>(); + rows.sort_by_key(|row| row.id().unwrap_or_default()); + Ok(rows) + } else if let Some(Some(tx)) = self.read_tx_by_connection.get(conn.as_index()) { + let mut rows = self + .db()? + .iter_by_col_range(tx, table_id, cols_list, (lower, upper)) + .map_err(|err| format!("read-tx range scan failed: {err}"))? 
+ .map(|row_ref| SimRow::from_product_value(row_ref.to_product_value())) + .collect::>(); + rows.sort_by_key(|row| row.id().unwrap_or_default()); + Ok(rows) + } else { + self.with_fresh_read_tx(|db, tx| { + let mut rows = db + .iter_by_col_range(tx, table_id, cols_list, (lower, upper)) + .map_err(|err| format!("range scan failed: {err}"))? + .map(|row_ref| SimRow::from_product_value(row_ref.to_product_value())) + .collect::>(); + rows.sort_by_key(|row| row.id().unwrap_or_default()); + Ok(rows) + }) + } + } +} + +impl TargetEngine for RelationalDbEngine { + type Observation = TableObservation; + type Outcome = TableWorkloadOutcome; + type Error = String; + + fn execute_interaction<'a>( + &'a mut self, + interaction: &'a TableWorkloadInteraction, + ) -> impl std::future::Future> + 'a { + async move { self.execute(interaction) } + } + + fn finish(&mut self) {} + + fn collect_outcome<'a>(&'a mut self) -> impl std::future::Future> + 'a { + async move { + let mut final_rows = Vec::with_capacity(self.base_schema.tables.len()); + let mut final_row_counts = Vec::with_capacity(self.base_schema.tables.len()); + for table in 0..self.base_schema.tables.len() { + let table_id = self.table_id_for_index(table).map_err(anyhow::Error::msg)?; + let rows = self.collect_rows_by_id(table_id).map_err(anyhow::Error::msg)?; + final_row_counts.push(rows.len() as u64); + final_rows.push(rows); + } + Ok(TableWorkloadOutcome { + final_row_counts, + final_rows, + }) + } + } +} + +impl TargetPropertyAccess for RelationalDbEngine { + fn schema_plan(&self) -> &SchemaPlan { + &self.base_schema + } + + fn lookup_in_connection(&self, conn: SessionId, table: usize, id: u64) -> Result, String> { + self.lookup_base_row(conn, table, id) + } + + fn collect_rows_in_connection(&self, conn: SessionId, table: usize) -> Result, String> { + self.collect_rows_in_connection(conn, table) + } + + fn collect_rows_for_table(&self, table: usize) -> Result, String> { + let table_id = self.table_id_for_index(table)?; + 
self.collect_rows_by_id(table_id) + } + + fn count_rows(&self, table: usize) -> Result { + let table_id = self.table_id_for_index(table)?; + self.with_fresh_read_tx(|db, tx| { + Ok(db + .iter(tx, table_id) + .map_err(|err| format!("count rows failed: {err}"))? + .count()) + }) + } + + fn count_by_col_eq(&self, table: usize, col: u16, value: &AlgebraicValue) -> Result { + let table_id = self.table_id_for_index(table)?; + self.with_fresh_read_tx(|db, tx| { + Ok(db + .iter_by_col_eq(tx, table_id, col, value) + .map_err(|err| format!("count by col eq failed: {err}"))? + .count()) + }) + } + + fn range_scan( + &self, + table: usize, + cols: &[u16], + lower: Bound, + upper: Bound, + ) -> Result, String> { + let table_id = self.table_id_for_index(table)?; + let cols_list = cols.iter().copied().collect::(); + self.with_fresh_read_tx(|db, tx| { + let mut rows = db + .iter_by_col_range(tx, table_id, cols_list, (lower, upper)) + .map_err(|err| format!("range scan failed: {err}"))? + .map(|row_ref| SimRow::from_product_value(row_ref.to_product_value())) + .collect::>(); + rows.sort_by_key(|row| row.id().unwrap_or_default()); + Ok(rows) + }) + } +} + +fn is_unique_constraint_violation(err: &DBError) -> bool { + err.to_string().contains("Unique") || err.to_string().contains("unique") +} + +fn is_write_conflict_error(err: &str) -> bool { + err.contains("WriteConflict") || err.contains("write conflict") || err.contains("Serialization failure") +} diff --git a/crates/dst/src/workload/mod.rs b/crates/dst/src/workload/mod.rs new file mode 100644 index 00000000000..faf3c04b5f2 --- /dev/null +++ b/crates/dst/src/workload/mod.rs @@ -0,0 +1,4 @@ +//! Shared workload generators reused by multiple DST targets. + +pub mod table_ops; +pub(crate) mod strategy; diff --git a/crates/dst/src/workload/strategy.rs b/crates/dst/src/workload/strategy.rs new file mode 100644 index 00000000000..6c70ebb9e94 --- /dev/null +++ b/crates/dst/src/workload/strategy.rs @@ -0,0 +1,112 @@ +//! 
Small proptest-inspired strategy primitives for deterministic DST generation. +//! +//! This is intentionally minimal: we keep DST's streaming execution model and +//! use strategies only for typed, composable input generation. + +use crate::sim::Rng; + +/// Typed strategy that can sample values from the shared deterministic RNG. +pub(crate) trait Strategy: Sized { + fn sample(&self, rng: &Rng) -> T; +} + +/// Picks a value in `[0, upper)`. +#[derive(Clone, Copy, Debug)] +pub(crate) struct Index { + upper: usize, +} + +impl Index { + pub(crate) fn new(upper: usize) -> Self { + assert!(upper > 0, "index upper bound must be non-zero"); + Self { upper } + } +} + +impl Strategy for Index { + fn sample(&self, rng: &Rng) -> usize { + rng.index(self.upper) + } +} + +/// Bernoulli-style strategy from an integer percentage in `[0, 100]`. +#[derive(Clone, Copy, Debug)] +pub(crate) struct Percent { + percent: usize, +} + +impl Percent { + pub(crate) fn new(percent: usize) -> Self { + assert!(percent <= 100, "percent must be in 0..=100, got {percent}"); + Self { percent } + } +} + +impl Strategy for Percent { + fn sample(&self, rng: &Rng) -> bool { + Index::new(100).sample(rng) < self.percent + } +} + +/// Weighted discrete choice over cloneable values. 
+#[derive(Clone, Debug)] +pub(crate) struct Weighted { + options: Vec<(usize, T)>, + total_weight: usize, +} + +impl Weighted { + pub(crate) fn new(options: Vec<(usize, T)>) -> Self { + let total_weight = options.iter().map(|(weight, _)| *weight).sum(); + assert!(total_weight > 0, "weighted strategy requires positive total weight"); + Self { options, total_weight } + } +} + +impl Strategy for Weighted { + fn sample(&self, rng: &Rng) -> T { + let mut pick = Index::new(self.total_weight).sample(rng); + for (weight, value) in &self.options { + if pick < *weight { + return value.clone(); + } + pick -= *weight; + } + self.options + .last() + .map(|(_, value)| value.clone()) + .expect("weighted strategy has at least one option") + } +} + +#[cfg(test)] +mod tests { + use crate::sim::Rng; + + use super::{Index, Percent, Strategy, Weighted}; + + #[test] + fn weighted_is_deterministic_for_seed() { + let strategy = Weighted::new(vec![(1, 10usize), (2, 20usize), (3, 30usize)]); + let rng_a = Rng::new(7); + let rng_b = Rng::new(7); + let a = (0..16).map(|_| strategy.sample(&rng_a)).collect::>(); + let b = (0..16).map(|_| strategy.sample(&rng_b)).collect::>(); + assert_eq!(a, b); + } + + #[test] + fn index_strategy_respects_bounds() { + let rng = Rng::new(123); + for _ in 0..64 { + let idx = Index::new(5).sample(&rng); + assert!(idx < 5); + } + } + + #[test] + #[should_panic(expected = "percent must be in 0..=100")] + fn percent_rejects_out_of_range_values() { + let _ = Percent::new(101); + } +} diff --git a/crates/dst/src/workload/table_ops/generation.rs b/crates/dst/src/workload/table_ops/generation.rs new file mode 100644 index 00000000000..b6050fd8e18 --- /dev/null +++ b/crates/dst/src/workload/table_ops/generation.rs @@ -0,0 +1,288 @@ +use std::collections::VecDeque; + +use crate::{ + client::SessionId, + core::WorkloadSource, + schema::{ColumnPlan, SchemaPlan, TablePlan}, + sim::{fork_seed, Rng}, + workload::strategy::{Index, Percent, Strategy}, +}; + +use super::{ + 
model::GenerationModel, + strategies::{ConnectionChoice, TableChoice, TxControlAction, TxControlChoice}, + TableScenario, TableWorkloadInteraction, +}; + +/// Streaming planner for table-oriented workloads. +/// +/// The stream keeps only generator state plus a small pending queue, so long +/// duration runs do not need to materialize the full interaction list in +/// memory up front. +#[derive(Clone, Debug)] +pub struct TableWorkloadSource { + rng: Rng, + scenario: S, + model: GenerationModel, + num_connections: usize, + target_interactions: usize, + emitted: usize, + finalize_conn: usize, + pending: VecDeque, + finished: bool, +} + +/// Narrow helper passed to scenario code so scenario-specific planning can +/// inspect the current model and enqueue interactions without owning the whole +/// stream state machine. +pub struct ScenarioPlanner<'a> { + rng: &'a Rng, + model: &'a mut GenerationModel, + pending: &'a mut VecDeque, +} + +impl<'a> ScenarioPlanner<'a> { + pub fn choose_index(&mut self, len: usize) -> usize { + Index::new(len).sample(self.rng) + } + + pub fn choose_table(&mut self) -> usize { + TableChoice { + table_count: self.model.schema.tables.len(), + } + .sample(self.rng) + } + + pub fn roll_percent(&mut self, percent: usize) -> bool { + Percent::new(percent).sample(self.rng) + } + + pub fn active_writer(&self) -> Option { + self.model.active_writer() + } + + pub fn has_read_tx(&self, conn: SessionId) -> bool { + self.model.has_read_tx(conn) + } + + pub fn any_read_tx(&self) -> bool { + self.model.any_read_tx() + } + + pub fn begin_read_tx(&mut self, conn: SessionId) { + self.model.begin_read_tx(conn); + } + + pub fn release_read_tx(&mut self, conn: SessionId) { + self.model.release_read_tx(conn); + } + + pub fn begin_tx(&mut self, conn: SessionId) { + self.model.begin_tx(conn); + } + + pub fn commit_tx(&mut self, conn: SessionId) { + self.model.commit(conn); + } + + pub fn rollback_tx(&mut self, conn: SessionId) { + self.model.rollback(conn); + } + + 
pub fn maybe_control_tx( + &mut self, + conn: SessionId, + begin_pct: usize, + commit_pct: usize, + rollback_pct: usize, + ) -> bool { + match (TxControlChoice { + begin_pct, + commit_pct, + rollback_pct, + }) + .sample(self.rng) + { + TxControlAction::Begin + if !self.model.connections[conn.as_index()].in_tx && !self.model.has_read_tx(conn) => + { + if self.model.active_writer().is_none() && !self.model.any_read_tx() { + self.model.begin_tx(conn); + self.pending.push_back(TableWorkloadInteraction::begin_tx(conn)); + } else { + self.pending + .push_back(TableWorkloadInteraction::begin_tx_conflict(conn)); + } + true + } + TxControlAction::Commit if self.model.connections[conn.as_index()].in_tx => { + self.model.commit(conn); + self.pending.push_back(TableWorkloadInteraction::commit_tx(conn)); + true + } + TxControlAction::Rollback if self.model.connections[conn.as_index()].in_tx => { + self.model.rollback(conn); + self.pending.push_back(TableWorkloadInteraction::rollback_tx(conn)); + true + } + _ => false, + } + } + + pub fn visible_rows(&self, conn: SessionId, table: usize) -> Vec { + self.model.visible_rows(conn, table) + } + + pub fn table_plan(&self, table: usize) -> &TablePlan { + &self.model.schema.tables[table] + } + + pub fn make_row(&mut self, table: usize) -> crate::schema::SimRow { + self.model.make_row(self.rng, table) + } + + pub fn insert(&mut self, conn: SessionId, table: usize, row: crate::schema::SimRow) { + self.model.insert(conn, table, row); + } + + pub fn batch_insert(&mut self, conn: SessionId, table: usize, rows: &[crate::schema::SimRow]) { + self.model.batch_insert(conn, table, rows); + } + + pub fn delete(&mut self, conn: SessionId, table: usize, row: crate::schema::SimRow) { + self.model.delete(conn, table, row); + } + + pub fn batch_delete(&mut self, conn: SessionId, table: usize, rows: &[crate::schema::SimRow]) { + self.model.batch_delete(conn, table, rows); + } + + pub fn add_column(&mut self, table: usize, column: ColumnPlan, default: 
spacetimedb_sats::AlgebraicValue) { + self.model.add_column(table, column, default); + } + + pub fn add_index(&mut self, table: usize, cols: Vec) { + self.model.add_index(table, cols); + } + + pub fn absent_row(&mut self, conn: SessionId, table: usize) -> crate::schema::SimRow { + self.model.absent_row(self.rng, conn, table) + } + + pub fn unique_key_conflict_row( + &mut self, + table: usize, + source: &crate::schema::SimRow, + ) -> Option { + self.model.unique_key_conflict_row(self.rng, table, source) + } + + pub fn push_interaction(&mut self, interaction: TableWorkloadInteraction) { + self.pending.push_back(interaction); + } +} + +impl TableWorkloadSource { + pub fn new( + seed: u64, + scenario: S, + schema: SchemaPlan, + num_connections: usize, + target_interactions: usize, + ) -> Self { + Self { + rng: Rng::new(fork_seed(seed, 17)), + scenario, + model: GenerationModel::new(&schema, num_connections, seed), + num_connections, + target_interactions, + emitted: 0, + finalize_conn: 0, + pending: VecDeque::new(), + finished: false, + } + } + + pub fn request_finish(&mut self) { + self.target_interactions = self.emitted; + } + + #[allow(dead_code)] + pub fn has_open_read_tx(&self) -> bool { + self.model.any_read_tx() + } + + #[allow(dead_code)] + pub fn has_open_write_tx(&self) -> bool { + self.model.active_writer().is_some() + } + + fn fill_pending(&mut self) { + if self.emitted >= self.target_interactions { + while self.finalize_conn < self.num_connections { + let conn = SessionId::from_index(self.finalize_conn); + self.finalize_conn += 1; + if self.model.connections[conn.as_index()].in_tx { + self.model.commit(conn); + self.pending.push_back(TableWorkloadInteraction::commit_tx(conn)); + return; + } + if self.model.has_read_tx(conn) { + self.model.release_read_tx(conn); + self.pending.push_back(TableWorkloadInteraction::release_read_tx(conn)); + return; + } + } + self.finished = true; + return; + } + + let conn = ConnectionChoice { + connection_count: 
self.num_connections, + } + .sample(&self.rng); + let mut planner = ScenarioPlanner { + rng: &self.rng, + model: &mut self.model, + pending: &mut self.pending, + }; + self.scenario.fill_pending(&mut planner, conn); + } +} + +impl TableWorkloadSource { + pub fn pull_next_interaction(&mut self) -> Option { + loop { + if let Some(interaction) = self.pending.pop_front() { + self.emitted += 1; + return Some(interaction); + } + + if self.finished { + return None; + } + + self.fill_pending(); + } + } +} + +impl WorkloadSource for TableWorkloadSource { + type Interaction = TableWorkloadInteraction; + + fn next_interaction(&mut self) -> Option { + self.pull_next_interaction() + } + + fn request_finish(&mut self) { + Self::request_finish(self); + } +} + +impl Iterator for TableWorkloadSource { + type Item = TableWorkloadInteraction; + + fn next(&mut self) -> Option { + self.pull_next_interaction() + } +} diff --git a/crates/dst/src/workload/table_ops/mod.rs b/crates/dst/src/workload/table_ops/mod.rs new file mode 100644 index 00000000000..facf8a92734 --- /dev/null +++ b/crates/dst/src/workload/table_ops/mod.rs @@ -0,0 +1,13 @@ +//! Shared transactional table workload used by table-oriented targets. 
+ +mod generation; +mod model; +mod scenarios; +pub(crate) mod strategies; +mod types; + +pub(crate) use generation::TableWorkloadSource; +pub(crate) use model::{PredictedOutcome, TableOracle}; +pub use scenarios::TableScenarioId; +pub(crate) use types::{ConnectionWriteState, TableScenario}; +pub use types::{TableErrorKind, TableInteractionCase, TableOperation, TableWorkloadInteraction, TableWorkloadOutcome}; diff --git a/crates/dst/src/workload/table_ops/model.rs b/crates/dst/src/workload/table_ops/model.rs new file mode 100644 index 00000000000..f56b1db5a25 --- /dev/null +++ b/crates/dst/src/workload/table_ops/model.rs @@ -0,0 +1,709 @@ +use std::ops::Bound; + +use spacetimedb_sats::AlgebraicValue; + +use crate::{ + client::SessionId, + schema::{distinct_value_for_type, generate_value_for_type, ColumnPlan, SchemaPlan, SimRow}, + sim::{fork_seed, Rng}, +}; + +use super::{TableErrorKind, TableOperation}; + +/// Generator-side model of committed rows plus per-connection pending writes. +/// +/// This model is used only while producing interactions. It lets the planner +/// pick valid deletes, synthesize visibility checks, and enforce the +/// single-writer discipline before the real target executes anything. 
+#[derive(Clone, Debug)] +pub(crate) struct GenerationModel { + pub(crate) schema: SchemaPlan, + pub(crate) connections: Vec, + committed: Vec>, + next_ids: Vec, + active_writer: Option, +} + +#[derive(Clone, Debug, Default)] +pub(crate) struct PendingConnection { + pub(crate) in_tx: bool, + read_snapshot: Option>>, + staged_inserts: Vec<(usize, SimRow)>, + staged_deletes: Vec<(usize, SimRow)>, +} + +impl GenerationModel { + pub(crate) fn new(schema: &SchemaPlan, num_connections: usize, seed: u64) -> Self { + Self { + schema: schema.clone(), + connections: vec![PendingConnection::default(); num_connections], + committed: vec![Vec::new(); schema.tables.len()], + next_ids: (0..schema.tables.len()) + .map(|idx| fork_seed(seed, idx as u64 + 100)) + .collect(), + active_writer: None, + } + } + + pub(crate) fn make_row(&mut self, rng: &Rng, table: usize) -> SimRow { + let table_plan = &self.schema.tables[table]; + let id = self.next_ids[table]; + self.next_ids[table] = self.next_ids[table].wrapping_add(1).max(1); + let mut values = vec![AlgebraicValue::U64(id)]; + for (idx, col) in table_plan.columns.iter().enumerate().skip(1) { + values.push(generate_value_for_type(rng, &col.ty, idx)); + } + SimRow { values } + } + + pub(crate) fn visible_rows(&self, conn: SessionId, table: usize) -> Vec { + let conn_idx = conn.as_index(); + if let Some(snapshot) = &self.connections[conn_idx].read_snapshot { + return snapshot[table].clone(); + } + let mut rows = self.committed[table].clone(); + let pending = &self.connections[conn_idx]; + for (pending_table, row) in &pending.staged_deletes { + if *pending_table == table { + rows.retain(|candidate| candidate != row); + } + } + for (pending_table, row) in &pending.staged_inserts { + if *pending_table == table { + rows.push(row.clone()); + } + } + rows + } + + pub(crate) fn absent_row(&mut self, rng: &Rng, conn: SessionId, table: usize) -> SimRow { + let mut row = self.make_row(rng, table); + while self.visible_rows(conn, 
table).iter().any(|candidate| candidate == &row) { + row = self.make_row(rng, table); + } + row + } + + pub(crate) fn unique_key_conflict_row(&self, rng: &Rng, table: usize, source: &SimRow) -> Option { + let table_plan = &self.schema.tables[table]; + let value_count = source.values.len().min(table_plan.columns.len()); + if value_count <= 1 { + return None; + } + + let col_idx = 1 + rng.index(value_count - 1); + let mut row = source.clone(); + row.values[col_idx] = distinct_value_for_type(&table_plan.columns[col_idx].ty, &row.values[col_idx]); + Some(row) + } + + pub(crate) fn active_writer(&self) -> Option { + self.active_writer + } + + pub(crate) fn has_read_tx(&self, conn: SessionId) -> bool { + self.connections[conn.as_index()].read_snapshot.is_some() + } + + pub(crate) fn any_read_tx(&self) -> bool { + self.connections + .iter() + .any(|connection| connection.read_snapshot.is_some()) + } + + pub(crate) fn begin_read_tx(&mut self, conn: SessionId) { + let pending = &mut self.connections[conn.as_index()]; + assert!(!pending.in_tx, "connection already has write transaction"); + assert!( + pending.read_snapshot.is_none(), + "connection already has read transaction" + ); + pending.read_snapshot = Some(self.committed.clone()); + } + + pub(crate) fn release_read_tx(&mut self, conn: SessionId) { + assert!( + self.connections[conn.as_index()].read_snapshot.take().is_some(), + "connection has no read transaction" + ); + } + + pub(crate) fn begin_tx(&mut self, conn: SessionId) { + assert!(self.active_writer.is_none(), "single writer already active"); + let pending = &mut self.connections[conn.as_index()]; + assert!(!pending.in_tx, "connection already in transaction"); + assert!( + pending.read_snapshot.is_none(), + "connection already has read transaction" + ); + pending.in_tx = true; + self.active_writer = Some(conn); + } + + pub(crate) fn insert(&mut self, conn: SessionId, table: usize, row: SimRow) { + let pending = &mut self.connections[conn.as_index()]; + if 
pending.in_tx { + pending.staged_inserts.push((table, row)); + } else { + self.committed[table].push(row); + } + } + + pub(crate) fn batch_insert(&mut self, conn: SessionId, table: usize, rows: &[SimRow]) { + for row in rows { + self.insert(conn, table, row.clone()); + } + } + + pub(crate) fn delete(&mut self, conn: SessionId, table: usize, row: SimRow) { + let pending = &mut self.connections[conn.as_index()]; + if pending.in_tx { + pending + .staged_inserts + .retain(|(pending_table, candidate)| !(*pending_table == table && *candidate == row)); + pending.staged_deletes.push((table, row)); + } else { + self.committed[table].retain(|candidate| *candidate != row); + } + } + + pub(crate) fn batch_delete(&mut self, conn: SessionId, table: usize, rows: &[SimRow]) { + for row in rows { + self.delete(conn, table, row.clone()); + } + } + + pub(crate) fn commit(&mut self, conn: SessionId) { + let pending = &mut self.connections[conn.as_index()]; + let inserts = std::mem::take(&mut pending.staged_inserts); + let deletes = std::mem::take(&mut pending.staged_deletes); + pending.in_tx = false; + self.active_writer = None; + + for (table, row) in &deletes { + self.committed[*table].retain(|candidate| candidate != row); + } + for (table, row) in &inserts { + self.committed[*table].push(row.clone()); + } + } + + pub(crate) fn rollback(&mut self, conn: SessionId) { + let pending = &mut self.connections[conn.as_index()]; + pending.staged_inserts.clear(); + pending.staged_deletes.clear(); + pending.in_tx = false; + self.active_writer = None; + } + + pub(crate) fn add_column(&mut self, table: usize, column: ColumnPlan, default: AlgebraicValue) { + self.schema.tables[table].columns.push(column); + for row in &mut self.committed[table] { + row.values.push(default.clone()); + } + for connection in &mut self.connections { + for (pending_table, row) in connection + .staged_inserts + .iter_mut() + .chain(connection.staged_deletes.iter_mut()) + { + if *pending_table == table { + 
row.values.push(default.clone()); + } + } + if let Some(snapshot) = &mut connection.read_snapshot { + for row in &mut snapshot[table] { + row.values.push(default.clone()); + } + } + } + } + + pub(crate) fn add_index(&mut self, table: usize, cols: Vec) { + let indexes = &mut self.schema.tables[table].extra_indexes; + if !indexes.contains(&cols) { + indexes.push(cols); + } + } +} + +/// Replay model used as the oracle for table workload properties. +/// +/// Target property runtimes apply every table interaction here in parallel with +/// real target execution, then compare the collected target outcome against this +/// model at the end of the run. +#[derive(Clone, Debug)] +pub struct TableOracle { + committed: Vec>, + connections: Vec, + active_writer: Option, +} + +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum PredictedOutcome { + Applied, + NoMutation { + subject: Option<(SessionId, usize)>, + }, + Error { + kind: TableErrorKind, + subject: Option<(SessionId, usize)>, + }, +} + +#[derive(Clone, Debug, Default)] +struct ExpectedConnection { + in_tx: bool, + read_snapshot: Option>>, + staged_inserts: Vec<(usize, SimRow)>, + staged_deletes: Vec<(usize, SimRow)>, +} + +impl TableOracle { + pub fn new(table_count: usize, connection_count: usize) -> Self { + Self { + committed: vec![Vec::new(); table_count], + connections: vec![ExpectedConnection::default(); connection_count], + active_writer: None, + } + } + + pub fn predict(&self, op: &TableOperation) -> Result { + match op { + TableOperation::BeginTx { conn } => { + self.ensure_connection(*conn)?; + if self.connections[conn.as_index()].read_snapshot.is_some() { + return Err(format!("connection {conn} cannot begin write tx with open read tx")); + } + if self.connections[conn.as_index()].in_tx { + return Err(format!("connection {conn} already has open write tx")); + } + if self.active_writer.is_some() + || self + .connections + .iter() + .any(|connection| connection.read_snapshot.is_some()) + { + return 
Ok(PredictedOutcome::Error { + kind: TableErrorKind::WriteConflict, + subject: None, + }); + } + Ok(PredictedOutcome::Applied) + } + TableOperation::BeginReadTx { conn } => { + self.ensure_connection(*conn)?; + let state = &self.connections[conn.as_index()]; + if state.in_tx || state.read_snapshot.is_some() { + return Err(format!("connection {conn} cannot begin read tx in current state")); + } + Ok(PredictedOutcome::Applied) + } + TableOperation::ReleaseReadTx { conn } => { + self.ensure_connection(*conn)?; + if self.connections[conn.as_index()].read_snapshot.is_none() { + return Err(format!("connection {conn} has no read tx to release")); + } + Ok(PredictedOutcome::Applied) + } + TableOperation::CommitTx { conn } | TableOperation::RollbackTx { conn } => { + self.ensure_connection(*conn)?; + if self.active_writer != Some(*conn) || !self.connections[conn.as_index()].in_tx { + return Err(format!("connection {conn} does not own an open write tx")); + } + Ok(PredictedOutcome::Applied) + } + TableOperation::InsertRows { conn, table, rows } => self.predict_insert_rows(*conn, *table, rows), + TableOperation::DeleteRows { conn, table, rows } => self.predict_delete_rows(*conn, *table, rows), + TableOperation::AddColumn { .. } | TableOperation::AddIndex { .. } => Ok(PredictedOutcome::Applied), + TableOperation::PointLookup { .. } + | TableOperation::PredicateCount { .. } + | TableOperation::RangeScan { .. } + | TableOperation::FullScan { .. 
} => Ok(PredictedOutcome::NoMutation { subject: None }), + } + } + + pub fn apply(&mut self, op: &TableOperation) { + match op { + TableOperation::BeginTx { conn } => { + assert!( + self.active_writer.is_none(), + "multiple concurrent writers in table oracle" + ); + self.connections[conn.as_index()].in_tx = true; + self.active_writer = Some(*conn); + } + TableOperation::BeginReadTx { conn } => { + let state = &mut self.connections[conn.as_index()]; + assert!(!state.in_tx, "read tx started while write tx is open"); + assert!(state.read_snapshot.is_none(), "nested read tx in table oracle"); + state.read_snapshot = Some(self.committed.clone()); + } + TableOperation::ReleaseReadTx { conn } => { + assert!( + self.connections[conn.as_index()].read_snapshot.take().is_some(), + "release read tx without open read tx" + ); + } + TableOperation::CommitTx { conn } => { + assert_eq!(self.active_writer, Some(*conn), "commit by non-owner in table oracle"); + let state = &mut self.connections[conn.as_index()]; + for (table, row) in state.staged_deletes.drain(..) { + self.committed[table].retain(|candidate| *candidate != row); + } + for (table, row) in state.staged_inserts.drain(..) { + self.committed[table].push(row); + } + state.in_tx = false; + self.active_writer = None; + } + TableOperation::RollbackTx { conn } => { + assert_eq!(self.active_writer, Some(*conn), "rollback by non-owner in table oracle"); + let state = &mut self.connections[conn.as_index()]; + state.staged_inserts.clear(); + state.staged_deletes.clear(); + state.in_tx = false; + self.active_writer = None; + } + TableOperation::InsertRows { conn, table, rows } => self.insert_rows(*conn, *table, rows), + TableOperation::DeleteRows { conn, table, rows } => self.delete_rows(*conn, *table, rows), + TableOperation::AddColumn { + table, + column: _, + default, + .. + } => { + self.add_column(*table, default.clone()); + } + TableOperation::AddIndex { .. } => {} + TableOperation::PointLookup { .. 
} + | TableOperation::PredicateCount { .. } + | TableOperation::RangeScan { .. } + | TableOperation::FullScan { .. } => {} + } + } + + fn predict_insert_rows(&self, conn: SessionId, table: usize, rows: &[SimRow]) -> Result { + if let Some(outcome) = self.predict_write_access(conn, table)? { + return Ok(outcome); + } + + let mut visible = self.visible_rows(conn, table); + let mut mutates = false; + for row in rows { + let Some(id) = row.id() else { + return Err(format!("insert row for table {table} is missing primary id: {row:?}")); + }; + match visible.iter().find(|candidate| candidate.id() == Some(id)) { + Some(existing) if existing == row => {} + Some(_) => { + return Ok(PredictedOutcome::Error { + kind: TableErrorKind::UniqueConstraintViolation, + subject: Some((conn, table)), + }); + } + None => { + mutates = true; + visible.push(row.clone()); + } + } + } + + if mutates { + Ok(PredictedOutcome::Applied) + } else { + Ok(PredictedOutcome::NoMutation { + subject: Some((conn, table)), + }) + } + } + + fn predict_delete_rows(&self, conn: SessionId, table: usize, rows: &[SimRow]) -> Result { + if let Some(outcome) = self.predict_write_access(conn, table)? 
{ + return Ok(outcome); + } + + let mut visible = self.visible_rows(conn, table); + for row in rows { + let Some(idx) = visible.iter().position(|candidate| candidate == row) else { + return Ok(PredictedOutcome::Error { + kind: TableErrorKind::MissingRow, + subject: Some((conn, table)), + }); + }; + visible.remove(idx); + } + + Ok(PredictedOutcome::Applied) + } + + fn predict_write_access(&self, conn: SessionId, table: usize) -> Result, String> { + self.ensure_connection(conn)?; + self.ensure_table(table)?; + if self.connections[conn.as_index()].read_snapshot.is_some() { + return Err(format!("connection {conn} cannot write while read tx is open")); + } + if let Some(owner) = self.active_writer + && owner != conn + { + return Ok(Some(PredictedOutcome::Error { + kind: TableErrorKind::WriteConflict, + subject: None, + })); + } + Ok(None) + } + + fn ensure_connection(&self, conn: SessionId) -> Result<(), String> { + self.connections + .get(conn.as_index()) + .map(|_| ()) + .ok_or_else(|| format!("connection {conn} out of range")) + } + + fn ensure_table(&self, table: usize) -> Result<(), String> { + self.committed + .get(table) + .map(|_| ()) + .ok_or_else(|| format!("table {table} out of range")) + } + + pub fn visible_rows(&self, conn: SessionId, table: usize) -> Vec { + let conn_idx = conn.as_index(); + if let Some(snapshot) = &self.connections[conn_idx].read_snapshot { + return snapshot[table].clone(); + } + let mut rows = self.committed[table].clone(); + let pending = &self.connections[conn_idx]; + for (pending_table, row) in &pending.staged_deletes { + if *pending_table == table { + rows.retain(|candidate| candidate != row); + } + } + for (pending_table, row) in &pending.staged_inserts { + if *pending_table == table { + rows.push(row.clone()); + } + } + rows + } + + pub fn lookup_by_id(&self, conn: SessionId, table: usize, id: u64) -> Option { + self.visible_rows(conn, table) + .into_iter() + .find(|row| row.id() == Some(id)) + } + + pub fn predicate_count(&self, 
conn: SessionId, table: usize, col: u16, value: &AlgebraicValue) -> usize { + self.visible_rows(conn, table) + .into_iter() + .filter(|row| row.values.get(col as usize) == Some(value)) + .count() + } + + pub fn range_scan( + &self, + conn: SessionId, + table: usize, + cols: &[u16], + lower: &Bound, + upper: &Bound, + ) -> Vec { + let mut rows = self + .visible_rows(conn, table) + .into_iter() + .filter(|row| { + let key = row.project_key(cols).to_algebraic_value(); + bound_contains_lower(lower, &key) && bound_contains_upper(upper, &key) + }) + .collect::>(); + rows.sort_by(|lhs, rhs| { + lhs.project_key(cols) + .to_algebraic_value() + .cmp(&rhs.project_key(cols).to_algebraic_value()) + .then_with(|| lhs.values.cmp(&rhs.values)) + }); + rows + } + + pub fn committed_rows(mut self) -> Vec> { + for table_rows in &mut self.committed { + table_rows.sort_by_key(|row| row.id().unwrap_or_default()); + } + self.committed + } + + fn insert(&mut self, conn: SessionId, table: usize, row: SimRow) { + let state = &mut self.connections[conn.as_index()]; + if state.in_tx { + state.staged_inserts.push((table, row)); + } else { + self.committed[table].push(row); + } + } + + fn insert_rows(&mut self, conn: SessionId, table: usize, rows: &[SimRow]) { + for row in rows { + if self + .visible_rows(conn, table) + .into_iter() + .any(|candidate| candidate == *row) + { + continue; + } + self.insert(conn, table, row.clone()); + } + } + + fn delete(&mut self, conn: SessionId, table: usize, row: SimRow) { + let state = &mut self.connections[conn.as_index()]; + if state.in_tx { + state + .staged_inserts + .retain(|(pending_table, candidate)| !(*pending_table == table && *candidate == row)); + state.staged_deletes.push((table, row)); + } else { + self.committed[table].retain(|candidate| *candidate != row); + } + } + + fn delete_rows(&mut self, conn: SessionId, table: usize, rows: &[SimRow]) { + for row in rows { + self.delete(conn, table, row.clone()); + } + } + + fn add_column(&mut self, 
table: usize, default: AlgebraicValue) { + for row in &mut self.committed[table] { + row.values.push(default.clone()); + } + for connection in &mut self.connections { + for (pending_table, row) in connection + .staged_inserts + .iter_mut() + .chain(connection.staged_deletes.iter_mut()) + { + if *pending_table == table { + row.values.push(default.clone()); + } + } + if let Some(snapshot) = &mut connection.read_snapshot { + for row in &mut snapshot[table] { + row.values.push(default.clone()); + } + } + } + } +} + +fn bound_contains_lower(bound: &Bound, key: &AlgebraicValue) -> bool { + match bound { + Bound::Included(value) => key >= value, + Bound::Excluded(value) => key > value, + Bound::Unbounded => true, + } +} + +fn bound_contains_upper(bound: &Bound, key: &AlgebraicValue) -> bool { + match bound { + Bound::Included(value) => key <= value, + Bound::Excluded(value) => key < value, + Bound::Unbounded => true, + } +} + +#[cfg(test)] +mod tests { + use spacetimedb_sats::AlgebraicValue; + + use crate::{client::SessionId, schema::SimRow}; + + use super::{PredictedOutcome, TableErrorKind, TableOperation, TableOracle}; + + fn row(id: u64) -> SimRow { + SimRow { + values: vec![AlgebraicValue::U64(id)], + } + } + + #[test] + fn write_conflict_prediction_does_not_request_blocking_visibility_check() { + let owner = SessionId::from_index(0); + let contender = SessionId::from_index(1); + let mut oracle = TableOracle::new(1, 2); + oracle.apply(&TableOperation::BeginTx { conn: owner }); + + let prediction = oracle + .predict(&TableOperation::InsertRows { + conn: contender, + table: 0, + rows: vec![row(1)], + }) + .unwrap(); + + assert_eq!( + prediction, + PredictedOutcome::Error { + kind: TableErrorKind::WriteConflict, + subject: None, + } + ); + } + + #[test] + fn exact_duplicate_insert_is_predicted_as_no_mutation() { + let conn = SessionId::from_index(0); + let mut oracle = TableOracle::new(1, 1); + oracle.apply(&TableOperation::InsertRows { + conn, + table: 0, + rows: 
vec![row(1)], + }); + + let prediction = oracle + .predict(&TableOperation::InsertRows { + conn, + table: 0, + rows: vec![row(1)], + }) + .unwrap(); + + assert_eq!( + prediction, + PredictedOutcome::NoMutation { + subject: Some((conn, 0)), + } + ); + } + + #[test] + fn same_id_different_row_is_predicted_as_unique_constraint_violation() { + let conn = SessionId::from_index(0); + let mut oracle = TableOracle::new(1, 1); + oracle.apply(&TableOperation::InsertRows { + conn, + table: 0, + rows: vec![SimRow { + values: vec![AlgebraicValue::U64(1), AlgebraicValue::U64(10)], + }], + }); + + let prediction = oracle + .predict(&TableOperation::InsertRows { + conn, + table: 0, + rows: vec![SimRow { + values: vec![AlgebraicValue::U64(1), AlgebraicValue::U64(11)], + }], + }) + .unwrap(); + + assert_eq!( + prediction, + PredictedOutcome::Error { + kind: TableErrorKind::UniqueConstraintViolation, + subject: Some((conn, 0)), + } + ); + } +} diff --git a/crates/dst/src/workload/table_ops/scenarios/mod.rs b/crates/dst/src/workload/table_ops/scenarios/mod.rs new file mode 100644 index 00000000000..4619473dc36 --- /dev/null +++ b/crates/dst/src/workload/table_ops/scenarios/mod.rs @@ -0,0 +1,48 @@ +mod random_crud; + +use crate::{client::SessionId, schema::SchemaPlan, sim::Rng}; + +use super::{generation::ScenarioPlanner, TableScenario, TableWorkloadOutcome}; + +#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)] +pub(crate) struct RandomCrudScenario; + +#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)] +pub enum TableScenarioId { + #[default] + RandomCrud, +} + +impl TableScenario for RandomCrudScenario { + fn generate_schema(&self, rng: &Rng) -> SchemaPlan { + random_crud::generate_schema(rng) + } + + fn validate_outcome(&self, schema: &SchemaPlan, outcome: &TableWorkloadOutcome) -> anyhow::Result<()> { + random_crud::validate_outcome(schema, outcome) + } + + fn fill_pending(&self, planner: &mut ScenarioPlanner<'_>, conn: SessionId) { + random_crud::fill_pending(planner, conn); 
+ } +} + +impl TableScenario for TableScenarioId { + fn generate_schema(&self, rng: &Rng) -> SchemaPlan { + match self { + Self::RandomCrud => RandomCrudScenario.generate_schema(rng), + } + } + + fn validate_outcome(&self, schema: &SchemaPlan, outcome: &TableWorkloadOutcome) -> anyhow::Result<()> { + match self { + Self::RandomCrud => RandomCrudScenario.validate_outcome(schema, outcome), + } + } + + fn fill_pending(&self, planner: &mut ScenarioPlanner<'_>, conn: SessionId) { + match self { + Self::RandomCrud => RandomCrudScenario.fill_pending(planner, conn), + } + } +} diff --git a/crates/dst/src/workload/table_ops/scenarios/random_crud.rs b/crates/dst/src/workload/table_ops/scenarios/random_crud.rs new file mode 100644 index 00000000000..5864592e0e6 --- /dev/null +++ b/crates/dst/src/workload/table_ops/scenarios/random_crud.rs @@ -0,0 +1,457 @@ +use std::ops::Bound; + +use spacetimedb_sats::AlgebraicType; + +use crate::{ + client::SessionId, + schema::{default_value_for_type, generate_supported_type, ColumnPlan, SchemaPlan, SimRow, TablePlan}, + sim::Rng, + workload::strategy::{Index, Percent, Strategy}, +}; + +use super::super::{generation::ScenarioPlanner, TableInteractionCase, TableWorkloadInteraction, TableWorkloadOutcome}; + +#[derive(Clone, Copy)] +struct TableWorkloadProfile { + min_tables: usize, + table_count_choices: usize, + min_extra_cols: usize, + extra_col_choices: usize, + preferred_range_cols: usize, + prefer_range_compatible_pct: usize, + prefer_u64_pct: usize, + single_index_pct: usize, + composite2_index_pct: usize, + composite3_index_pct: usize, + insert_pct: usize, + begin_tx_pct: usize, + commit_tx_pct: usize, + rollback_tx_pct: usize, + begin_read_tx_pct: usize, + release_read_tx_pct: usize, + empty_tx_pct: usize, + exact_duplicate_insert_pct: usize, + unique_key_conflict_insert_pct: usize, + add_column_pct: usize, + add_index_pct: usize, +} + +const RANDOM_CRUD_PROFILE: TableWorkloadProfile = TableWorkloadProfile { + min_tables: 2, + 
table_count_choices: 3, + min_extra_cols: 1, + extra_col_choices: 4, + preferred_range_cols: 2, + prefer_range_compatible_pct: 65, + prefer_u64_pct: 75, + single_index_pct: 70, + composite2_index_pct: 65, + composite3_index_pct: 30, + insert_pct: 65, + begin_tx_pct: 20, + commit_tx_pct: 15, + rollback_tx_pct: 10, + begin_read_tx_pct: 4, + release_read_tx_pct: 35, + empty_tx_pct: 2, + exact_duplicate_insert_pct: 4, + unique_key_conflict_insert_pct: 4, + add_column_pct: 1, + add_index_pct: 2, +}; + +pub fn generate_schema(rng: &Rng) -> SchemaPlan { + generate_schema_with_profile(rng, RANDOM_CRUD_PROFILE) +} + +fn generate_schema_with_profile(rng: &Rng, profile: TableWorkloadProfile) -> SchemaPlan { + let table_count = profile.min_tables + Index::new(profile.table_count_choices).sample(rng); + let mut tables = Vec::with_capacity(table_count); + + for table_idx in 0..table_count { + let extra_cols = profile.min_extra_cols + Index::new(profile.extra_col_choices).sample(rng); + let mut columns = vec![ColumnPlan { + name: "id".into(), + ty: AlgebraicType::U64, + }]; + for col_idx in 0..extra_cols { + let ty = if col_idx < profile.preferred_range_cols + && Percent::new(profile.prefer_range_compatible_pct).sample(rng) + { + if Percent::new(profile.prefer_u64_pct).sample(rng) { + AlgebraicType::U64 + } else { + AlgebraicType::Bool + } + } else { + generate_supported_type(rng) + }; + columns.push(ColumnPlan { + name: format!("c{table_idx}_{col_idx}"), + ty, + }); + } + let mut extra_indexes = Vec::new(); + let non_primary_range_cols = columns + .iter() + .enumerate() + .skip(1) + .filter(|(_, col)| is_range_compatible(&col.ty)) + .map(|(idx, _)| idx as u16) + .collect::>(); + if let Some(&col) = non_primary_range_cols.first() + && Percent::new(profile.single_index_pct).sample(rng) + { + extra_indexes.push(vec![col]); + } + if non_primary_range_cols.len() >= 2 && Percent::new(profile.composite2_index_pct).sample(rng) { + 
extra_indexes.push(non_primary_range_cols[..2].to_vec()); + } + if non_primary_range_cols.len() >= 3 && Percent::new(profile.composite3_index_pct).sample(rng) { + extra_indexes.push(non_primary_range_cols[..3].to_vec()); + } + extra_indexes.sort(); + extra_indexes.dedup(); + tables.push(TablePlan { + name: format!("dst_table_{table_idx}_{}", rng.next_u64() % 10_000), + columns, + extra_indexes, + }); + } + + SchemaPlan { tables } +} + +pub fn validate_outcome(_schema: &SchemaPlan, _outcome: &TableWorkloadOutcome) -> anyhow::Result<()> { + Ok(()) +} + +pub fn fill_pending(planner: &mut ScenarioPlanner<'_>, conn: SessionId) { + fill_pending_with_profile(planner, conn, RANDOM_CRUD_PROFILE); +} + +fn fill_pending_with_profile(planner: &mut ScenarioPlanner<'_>, conn: SessionId, profile: TableWorkloadProfile) { + if planner.has_read_tx(conn) { + let table = planner.choose_table(); + let visible_rows = planner.visible_rows(conn, table); + if planner.roll_percent(profile.release_read_tx_pct) { + planner.release_read_tx(conn); + planner.push_interaction(TableWorkloadInteraction::release_read_tx(conn)); + } else if !emit_query(planner, conn, table, &visible_rows) { + planner.push_interaction(TableWorkloadInteraction::full_scan(conn, table)); + } + return; + } + + if planner.active_writer().is_none() { + if planner.roll_percent(profile.empty_tx_pct) { + let rollback = planner.roll_percent(50); + planner.begin_tx(conn); + planner.push_interaction(TableWorkloadInteraction::begin_tx(conn)); + if rollback { + planner.rollback_tx(conn); + planner.push_interaction(TableWorkloadInteraction::rollback_tx(conn)); + } else { + planner.commit_tx(conn); + planner.push_interaction(TableWorkloadInteraction::commit_tx(conn)); + } + return; + } + + if planner.roll_percent(profile.begin_read_tx_pct) { + planner.begin_read_tx(conn); + planner.push_interaction(TableWorkloadInteraction::begin_read_tx(conn)); + let table = planner.choose_table(); + let visible_rows = planner.visible_rows(conn, 
table); + if !emit_query(planner, conn, table, &visible_rows) { + planner.push_interaction(TableWorkloadInteraction::full_scan(conn, table)); + } + return; + } + } + + if planner.maybe_control_tx( + conn, + profile.begin_tx_pct, + profile.commit_tx_pct, + profile.rollback_tx_pct, + ) { + return; + } + + let table = planner.choose_table(); + let visible_rows = planner.visible_rows(conn, table); + if planner.active_writer().is_none() + && !planner.any_read_tx() + && !visible_rows.is_empty() + && planner.roll_percent(profile.add_column_pct) + && emit_add_column(planner, conn, table) + { + return; + } + if planner.active_writer().is_none() + && !planner.any_read_tx() + && visible_rows.len() >= 2 + && planner.roll_percent(profile.add_index_pct) + && emit_add_index(planner, conn, table, &visible_rows) + { + return; + } + if emit_query(planner, conn, table, &visible_rows) { + return; + } + if planner.roll_percent(5) { + let row = planner.absent_row(conn, table); + planner.push_interaction(TableWorkloadInteraction::delete_missing(conn, table, row)); + return; + } + let choose_insert = visible_rows.is_empty() || planner.roll_percent(profile.insert_pct); + if choose_insert { + if planner.roll_percent(10) { + let count = 2 + planner.choose_index(3); + let rows = (0..count).map(|_| planner.make_row(table)).collect::>(); + planner.batch_insert(conn, table, &rows); + planner.push_interaction(TableWorkloadInteraction::batch_insert(conn, table, rows)); + return; + } + let row = planner.make_row(table); + planner.insert(conn, table, row.clone()); + planner.push_interaction(TableWorkloadInteraction::insert(conn, table, row)); + return; + } + + if planner.roll_percent(profile.exact_duplicate_insert_pct) { + let row = visible_rows[planner.choose_index(visible_rows.len())].clone(); + planner.push_interaction(TableWorkloadInteraction::exact_duplicate_insert(conn, table, row)); + return; + } + if planner.roll_percent(profile.unique_key_conflict_insert_pct) + && 
emit_unique_key_conflict_insert(planner, conn, table, &visible_rows) + { + return; + } + + if visible_rows.len() >= 2 && planner.roll_percent(10) { + let count = 2 + planner.choose_index(visible_rows.len().min(3) - 1); + let mut candidates = visible_rows.clone(); + let mut rows = Vec::with_capacity(count); + for _ in 0..count { + let idx = planner.choose_index(candidates.len()); + rows.push(candidates.remove(idx)); + } + planner.batch_delete(conn, table, &rows); + planner.push_interaction(TableWorkloadInteraction::batch_delete(conn, table, rows)); + return; + } + if planner.roll_percent(6) { + let row = visible_rows[planner.choose_index(visible_rows.len())].clone(); + planner.delete(conn, table, row.clone()); + planner.push_interaction(TableWorkloadInteraction::delete_with_case( + conn, + table, + row.clone(), + TableInteractionCase::Reinsert, + )); + planner.insert(conn, table, row.clone()); + planner.push_interaction(TableWorkloadInteraction::insert(conn, table, row)); + return; + } + + let row = visible_rows[planner.choose_index(visible_rows.len())].clone(); + planner.delete(conn, table, row.clone()); + planner.push_interaction(TableWorkloadInteraction::delete(conn, table, row)); +} + +fn emit_add_column(planner: &mut ScenarioPlanner<'_>, conn: SessionId, table: usize) -> bool { + const MAX_COLUMNS_PER_TABLE: usize = 12; + let column_idx = planner.table_plan(table).columns.len(); + if column_idx >= MAX_COLUMNS_PER_TABLE { + return false; + } + let ty = match planner.choose_index(4) { + 0 => AlgebraicType::Bool, + 1 => AlgebraicType::U64, + 2 => AlgebraicType::String, + _ => generate_supported_type_for_churn(planner), + }; + let column = ColumnPlan { + name: format!("dst_added_{table}_{column_idx}"), + ty, + }; + let default = default_value_for_type(&column.ty); + planner.add_column(table, column.clone(), default.clone()); + planner.push_interaction(TableWorkloadInteraction::add_column(conn, table, column, default)); + true +} + +fn emit_add_index(planner: &mut 
ScenarioPlanner<'_>, conn: SessionId, table: usize, visible_rows: &[SimRow]) -> bool { + let candidates = candidate_new_indexes(planner, table); + if candidates.is_empty() { + return false; + } + let cols = candidates[planner.choose_index(candidates.len())].clone(); + planner.add_index(table, cols.clone()); + planner.push_interaction(TableWorkloadInteraction::add_index(conn, table, cols.clone())); + if let Some((lower, upper)) = inclusive_bounds_for_rows(visible_rows, &cols) { + planner.push_interaction(TableWorkloadInteraction::range_scan( + conn, + table, + cols, + Bound::Included(lower), + Bound::Included(upper), + )); + } + true +} + +fn emit_unique_key_conflict_insert( + planner: &mut ScenarioPlanner<'_>, + conn: SessionId, + table: usize, + visible_rows: &[SimRow], +) -> bool { + let source = visible_rows[planner.choose_index(visible_rows.len())].clone(); + let Some(row) = planner.unique_key_conflict_row(table, &source) else { + return false; + }; + planner.push_interaction(TableWorkloadInteraction::unique_key_conflict_insert(conn, table, row)); + true +} + +fn generate_supported_type_for_churn(planner: &mut ScenarioPlanner<'_>) -> AlgebraicType { + match planner.choose_index(6) { + 0 => AlgebraicType::I64, + 1 => AlgebraicType::U32, + 2 => AlgebraicType::I32, + 3 => AlgebraicType::U8, + 4 => AlgebraicType::I128, + _ => AlgebraicType::U128, + } +} + +fn candidate_new_indexes(planner: &ScenarioPlanner<'_>, table: usize) -> Vec> { + let table_plan = planner.table_plan(table); + let cols = table_plan + .columns + .iter() + .enumerate() + .skip(1) + .filter(|(_, column)| is_range_compatible(&column.ty)) + .map(|(idx, _)| idx as u16) + .collect::>(); + let mut candidates = Vec::new(); + for width in 1..=cols.len().min(3) { + let candidate = cols[..width].to_vec(); + if !table_plan.extra_indexes.contains(&candidate) { + candidates.push(candidate); + } + } + candidates +} + +fn inclusive_bounds_for_rows( + rows: &[SimRow], + cols: &[u16], +) -> 
Option<(spacetimedb_sats::AlgebraicValue, spacetimedb_sats::AlgebraicValue)> { + let mut sorted = rows.to_vec(); + sorted.sort_by(|lhs, rhs| { + lhs.project_key(cols) + .to_algebraic_value() + .cmp(&rhs.project_key(cols).to_algebraic_value()) + .then_with(|| lhs.values.cmp(&rhs.values)) + }); + let lower = sorted.first()?.project_key(cols).to_algebraic_value(); + let upper = sorted.last()?.project_key(cols).to_algebraic_value(); + Some((lower, upper)) +} + +fn emit_query( + planner: &mut ScenarioPlanner<'_>, + conn: SessionId, + table: usize, + visible_rows: &[crate::schema::SimRow], +) -> bool { + if !planner.roll_percent(25) { + return false; + } + if visible_rows.is_empty() { + planner.push_interaction(TableWorkloadInteraction::full_scan(conn, table)); + return true; + } + + match planner.choose_index(4) { + 0 => { + let row = &visible_rows[planner.choose_index(visible_rows.len())]; + if let Some(id) = row.id() { + planner.push_interaction(TableWorkloadInteraction::point_lookup(conn, table, id)); + true + } else { + false + } + } + 1 => { + let col = choose_predicate_col(planner, table); + let row = &visible_rows[planner.choose_index(visible_rows.len())]; + if let Some(value) = row.values.get(col as usize).cloned() { + planner.push_interaction(TableWorkloadInteraction::predicate_count(conn, table, col, value)); + true + } else { + false + } + } + 2 => { + let extra_indexes = planner.table_plan(table).extra_indexes.clone(); + let Some(cols) = extra_indexes + .into_iter() + .find(|cols| range_cols_supported(planner, table, cols)) + else { + planner.push_interaction(TableWorkloadInteraction::full_scan(conn, table)); + return true; + }; + let mut rows = visible_rows.to_vec(); + rows.sort_by(|lhs, rhs| { + lhs.project_key(&cols) + .to_algebraic_value() + .cmp(&rhs.project_key(&cols).to_algebraic_value()) + .then_with(|| lhs.values.cmp(&rhs.values)) + }); + let lower = rows[0].project_key(&cols).to_algebraic_value(); + let upper = rows[rows.len() - 
1].project_key(&cols).to_algebraic_value(); + planner.push_interaction(TableWorkloadInteraction::range_scan( + conn, + table, + cols, + Bound::Included(lower), + Bound::Included(upper), + )); + true + } + _ => { + planner.push_interaction(TableWorkloadInteraction::full_scan(conn, table)); + true + } + } +} + +fn choose_predicate_col(planner: &mut ScenarioPlanner<'_>, table: usize) -> u16 { + let column_count = planner.table_plan(table).columns.len(); + if column_count <= 1 { + 0 + } else { + 1 + planner.choose_index(column_count - 1) as u16 + } +} + +fn range_cols_supported(planner: &ScenarioPlanner<'_>, table: usize, cols: &[u16]) -> bool { + cols.iter().all(|col| { + planner + .table_plan(table) + .columns + .get(*col as usize) + .is_some_and(|column| is_range_compatible(&column.ty)) + }) +} + +fn is_range_compatible(ty: &AlgebraicType) -> bool { + matches!(ty, AlgebraicType::U64 | AlgebraicType::Bool) +} diff --git a/crates/dst/src/workload/table_ops/strategies.rs b/crates/dst/src/workload/table_ops/strategies.rs new file mode 100644 index 00000000000..42dbc6c2ee4 --- /dev/null +++ b/crates/dst/src/workload/table_ops/strategies.rs @@ -0,0 +1,66 @@ +//! Typed strategies specific to table-style workload generation. + +use crate::{ + client::SessionId, + sim::Rng, + workload::strategy::{Index, Strategy, Weighted}, +}; + +/// Choose one logical session uniformly from the current fixed-size session pool. +#[derive(Clone, Copy, Debug)] +pub(crate) struct ConnectionChoice { + pub(crate) connection_count: usize, +} + +impl Strategy for ConnectionChoice { + fn sample(&self, rng: &Rng) -> SessionId { + SessionId::from_index(Index::new(self.connection_count).sample(rng)) + } +} + +/// Choose one table uniformly. 
+#[derive(Clone, Copy, Debug)] +pub(crate) struct TableChoice { + pub(crate) table_count: usize, +} + +impl Strategy for TableChoice { + fn sample(&self, rng: &Rng) -> usize { + Index::new(self.table_count).sample(rng) + } +} + +/// Weighted transaction control action. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub(crate) enum TxControlAction { + Begin, + Commit, + Rollback, + None, +} + +/// Strategy for begin/commit/rollback control flow. +#[derive(Clone, Copy, Debug)] +pub(crate) struct TxControlChoice { + pub(crate) begin_pct: usize, + pub(crate) commit_pct: usize, + pub(crate) rollback_pct: usize, +} + +impl Strategy for TxControlChoice { + fn sample(&self, rng: &Rng) -> TxControlAction { + let begin = self.begin_pct.min(100); + let commit = self.commit_pct.min(100); + let rollback = self.rollback_pct.min(100); + let reserved = begin.saturating_add(commit).saturating_add(rollback).min(100); + let none = 100usize.saturating_sub(reserved); + + Weighted::new(vec![ + (begin, TxControlAction::Begin), + (commit, TxControlAction::Commit), + (rollback, TxControlAction::Rollback), + (none, TxControlAction::None), + ]) + .sample(rng) + } +} diff --git a/crates/dst/src/workload/table_ops/types.rs b/crates/dst/src/workload/table_ops/types.rs new file mode 100644 index 00000000000..6b589b0cdaf --- /dev/null +++ b/crates/dst/src/workload/table_ops/types.rs @@ -0,0 +1,311 @@ +use std::ops::Bound; + +use spacetimedb_sats::AlgebraicValue; + +use crate::{ + client::SessionId, + schema::{ColumnPlan, SchemaPlan, SimRow}, + sim::Rng, +}; + +use super::generation::ScenarioPlanner; + +/// Scenario hook for shared table-oriented workloads. +/// +/// A scenario supplies the initial schema, scenario-specific commit-time +/// properties, and any final invariant over the collected outcome. 
pub(crate) trait TableScenario: Clone {
    /// Produces the schema plan for a run, drawing randomness from `rng`.
    fn generate_schema(&self, rng: &Rng) -> SchemaPlan;
    /// Checks the collected `outcome` against `schema`.
    ///
    /// NOTE(review): presumably an `Err` reports a violated final invariant —
    /// confirm against the implementors.
    fn validate_outcome(&self, schema: &SchemaPlan, outcome: &TableWorkloadOutcome) -> anyhow::Result<()>;
    /// Plans the next step(s) for session `conn` through `planner`.
    ///
    /// NOTE(review): implementations are expected to push at least one
    /// interaction per call — confirm against the generator loop.
    fn fill_pending(&self, planner: &mut ScenarioPlanner<'_>, conn: SessionId);
}
+ PredicateCount { + conn: SessionId, + table: usize, + col: u16, + value: AlgebraicValue, + }, + /// Scan an indexed range and compare against model filtering. + RangeScan { + conn: SessionId, + table: usize, + cols: Vec, + lower: Bound, + upper: Bound, + }, + /// Scan all visible rows and compare against the model. + FullScan { conn: SessionId, table: usize }, +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum TableErrorKind { + UniqueConstraintViolation, + MissingRow, + WriteConflict, +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum TableInteractionCase { + BeginTx, + CommitTx, + RollbackTx, + BeginReadTx, + ReleaseReadTx, + BeginTxConflict, + WriteConflictInsert, + Insert, + Delete, + ExactDuplicateInsert, + UniqueKeyConflictInsert, + DeleteMissing, + BatchInsert, + BatchDelete, + Reinsert, + AddColumn, + AddIndex, + PointLookup, + PredicateCount, + RangeScan, + FullScan, +} + +impl PlannedInteraction { + pub fn new(op: TableOperation, case: TableInteractionCase) -> Self { + Self { op, case } + } + + pub fn begin_tx(conn: SessionId) -> Self { + Self::new(TableOperation::BeginTx { conn }, TableInteractionCase::BeginTx) + } + + pub fn commit_tx(conn: SessionId) -> Self { + Self::new(TableOperation::CommitTx { conn }, TableInteractionCase::CommitTx) + } + + pub fn rollback_tx(conn: SessionId) -> Self { + Self::new(TableOperation::RollbackTx { conn }, TableInteractionCase::RollbackTx) + } + + pub fn begin_read_tx(conn: SessionId) -> Self { + Self::new(TableOperation::BeginReadTx { conn }, TableInteractionCase::BeginReadTx) + } + + pub fn release_read_tx(conn: SessionId) -> Self { + Self::new( + TableOperation::ReleaseReadTx { conn }, + TableInteractionCase::ReleaseReadTx, + ) + } + + pub fn begin_tx_conflict(conn: SessionId) -> Self { + Self::new(TableOperation::BeginTx { conn }, TableInteractionCase::BeginTxConflict) + } + + pub fn write_conflict_insert(conn: SessionId, table: usize, row: SimRow) -> Self { + Self::insert_rows(conn, table, 
vec![row], TableInteractionCase::WriteConflictInsert) + } + + pub fn insert(conn: SessionId, table: usize, row: SimRow) -> Self { + Self::insert_with_case(conn, table, row, TableInteractionCase::Insert) + } + + pub fn insert_with_case(conn: SessionId, table: usize, row: SimRow, case: TableInteractionCase) -> Self { + Self::insert_rows(conn, table, vec![row], case) + } + + pub fn delete(conn: SessionId, table: usize, row: SimRow) -> Self { + Self::delete_with_case(conn, table, row, TableInteractionCase::Delete) + } + + pub fn delete_with_case(conn: SessionId, table: usize, row: SimRow, case: TableInteractionCase) -> Self { + Self::delete_rows(conn, table, vec![row], case) + } + + pub fn exact_duplicate_insert(conn: SessionId, table: usize, row: SimRow) -> Self { + Self::insert_with_case(conn, table, row, TableInteractionCase::ExactDuplicateInsert) + } + + pub fn unique_key_conflict_insert(conn: SessionId, table: usize, row: SimRow) -> Self { + Self::insert_with_case(conn, table, row, TableInteractionCase::UniqueKeyConflictInsert) + } + + pub fn delete_missing(conn: SessionId, table: usize, row: SimRow) -> Self { + Self::delete_with_case(conn, table, row, TableInteractionCase::DeleteMissing) + } + + pub fn batch_insert(conn: SessionId, table: usize, rows: Vec) -> Self { + Self::insert_rows(conn, table, rows, TableInteractionCase::BatchInsert) + } + + pub fn batch_delete(conn: SessionId, table: usize, rows: Vec) -> Self { + Self::delete_rows(conn, table, rows, TableInteractionCase::BatchDelete) + } + + fn insert_rows(conn: SessionId, table: usize, rows: Vec, case: TableInteractionCase) -> Self { + Self::new(TableOperation::InsertRows { conn, table, rows }, case) + } + + fn delete_rows(conn: SessionId, table: usize, rows: Vec, case: TableInteractionCase) -> Self { + Self::new(TableOperation::DeleteRows { conn, table, rows }, case) + } + + pub fn add_column(conn: SessionId, table: usize, column: ColumnPlan, default: AlgebraicValue) -> Self { + Self::new( + 
TableOperation::AddColumn { + conn, + table, + column, + default, + }, + TableInteractionCase::AddColumn, + ) + } + + pub fn add_index(conn: SessionId, table: usize, cols: Vec) -> Self { + Self::new( + TableOperation::AddIndex { conn, table, cols }, + TableInteractionCase::AddIndex, + ) + } + + pub fn point_lookup(conn: SessionId, table: usize, id: u64) -> Self { + Self::new( + TableOperation::PointLookup { conn, table, id }, + TableInteractionCase::PointLookup, + ) + } + + pub fn predicate_count(conn: SessionId, table: usize, col: u16, value: AlgebraicValue) -> Self { + Self::new( + TableOperation::PredicateCount { + conn, + table, + col, + value, + }, + TableInteractionCase::PredicateCount, + ) + } + + pub fn range_scan( + conn: SessionId, + table: usize, + cols: Vec, + lower: Bound, + upper: Bound, + ) -> Self { + Self::new( + TableOperation::RangeScan { + conn, + table, + cols, + lower, + upper, + }, + TableInteractionCase::RangeScan, + ) + } + + pub fn full_scan(conn: SessionId, table: usize) -> Self { + Self::new(TableOperation::FullScan { conn, table }, TableInteractionCase::FullScan) + } +} + +/// Final state gathered from a table-workload engine after execution ends. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct TableWorkloadOutcome { + /// Row count for each table in schema order. + pub final_row_counts: Vec, + /// Full committed rows for each table in schema order. + pub final_rows: Vec>, +} + +/// Per-session write transaction bookkeeping shared by locking targets. +pub(crate) struct ConnectionWriteState { + /// Open mutable transaction handle for each simulated session. + pub tx_by_connection: Vec>, + /// Session that currently owns the single-writer lock, if any. 
+ pub active_writer: Option, +} + +impl ConnectionWriteState { + pub fn new(connection_count: usize) -> Self { + Self { + tx_by_connection: (0..connection_count).map(|_| None).collect(), + active_writer: None, + } + } + + pub fn ensure_known_connection(&self, conn: SessionId) -> Result<(), String> { + self.tx_by_connection + .get(conn.as_index()) + .map(|_| ()) + .ok_or_else(|| format!("connection {conn} out of range")) + } + + pub fn ensure_writer_owner(&self, conn: SessionId, action: &str) -> Result<(), String> { + self.ensure_known_connection(conn)?; + match self.active_writer { + Some(owner) if owner == conn => Ok(()), + Some(owner) => Err(format!( + "connection {conn} cannot {action} while connection {owner} owns lock" + )), + None => Err(format!("connection {conn} has no transaction to {action}")), + } + } +} diff --git a/crates/durability/Cargo.toml b/crates/durability/Cargo.toml index 0ea8022fcbe..4eaa3870001 100644 --- a/crates/durability/Cargo.toml +++ b/crates/durability/Cargo.toml @@ -21,6 +21,7 @@ scopeguard.workspace = true spacetimedb-commitlog.workspace = true spacetimedb-fs-utils.workspace = true spacetimedb-paths.workspace = true +spacetimedb-runtime = { workspace = true, features = ["tokio"] } spacetimedb-sats.workspace = true thiserror.workspace = true tokio.workspace = true diff --git a/crates/durability/src/imp/local.rs b/crates/durability/src/imp/local.rs index 04d46d8f634..3447e4fbf9a 100644 --- a/crates/durability/src/imp/local.rs +++ b/crates/durability/src/imp/local.rs @@ -19,11 +19,9 @@ use spacetimedb_commitlog::{ }; use spacetimedb_fs_utils::lockfile::advisory::{LockError, LockedFile}; use spacetimedb_paths::server::ReplicaDir; +use spacetimedb_runtime::{Handle, JoinHandle}; use thiserror::Error; -use tokio::{ - sync::watch, - task::{spawn_blocking, JoinHandle}, -}; +use tokio::sync::watch; use tracing::{instrument, Span}; use crate::{Close, Durability, DurableOffset, History, PreparedTx, TxOffset}; @@ -119,13 +117,13 @@ impl Local { 
/// /// `replica_dir` must already exist. /// - /// Background tasks are spawned onto the provided tokio runtime. + /// Background tasks are spawned onto the provided runtime. /// /// We will send a message down the `on_new_segment` channel whenever we begin a new commitlog segment. /// This is used to capture a snapshot each new segment. pub fn open( replica_dir: ReplicaDir, - rt: tokio::runtime::Handle, + rt: Handle, opts: Options, on_new_segment: Option>, ) -> Result { @@ -150,7 +148,7 @@ where R: RepoWithoutLockFile + Send + Sync + 'static, { /// Create a [`Local`] instance backed by the provided commitlog repo. - pub fn open_with_repo(repo: R, rt: tokio::runtime::Handle, opts: Options) -> Result { + pub fn open_with_repo(repo: R, rt: Handle, opts: Options) -> Result { info!("open local durability"); let clog = Arc::new(Commitlog::open_with_repo(repo, opts.commitlog)?); Self::open_inner(clog, rt, opts, None) @@ -164,7 +162,7 @@ where { fn open_inner( clog: Arc, R>>, - rt: tokio::runtime::Handle, + rt: Handle, opts: Options, lock: Option, ) -> Result { @@ -172,16 +170,13 @@ where let (queue, txdata_rx) = async_channel::bounded(queue_capacity); let queue_depth = Arc::new(AtomicU64::new(0)); let (durable_tx, durable_rx) = watch::channel(clog.max_committed_offset()); - let actor = rt.spawn( Actor { clog: clog.clone(), - durable_offset: durable_tx, queue_depth: queue_depth.clone(), - batch_capacity: opts.batch_capacity, - + runtime: rt.clone(), lock, } .run(txdata_rx), @@ -204,8 +199,8 @@ where impl Local where - T: Send + Sync + 'static, - R: Repo + Send + Sync + 'static, + T: Encode + Send + Sync + 'static, + R: RepoWithoutLockFile + Send + Sync + 'static, { /// Inspect how many transactions added via [`Self::append_tx`] are pending /// to be applied to the underlying [`Commitlog`]. 
@@ -246,6 +241,7 @@ where queue_depth: Arc, batch_capacity: NonZeroUsize, + runtime: Handle, #[allow(unused)] lock: Option, @@ -281,15 +277,16 @@ where let clog = self.clog.clone(); let ready_len = tx_buf.len(); self.queue_depth.fetch_sub(ready_len as u64, Relaxed); - tx_buf = spawn_blocking(move || -> io::Result>>> { - for tx in tx_buf.drain(..) { - clog.commit([tx.into_transaction()])?; - } - Ok(tx_buf) - }) - .await - .expect("commitlog write panicked") - .expect("commitlog write failed"); + let runtime = self.runtime.clone(); + tx_buf = runtime + .spawn_blocking(move || -> io::Result>>> { + for tx in tx_buf.drain(..) { + clog.commit([tx.into_transaction()])?; + } + Ok(tx_buf) + }) + .await + .expect("commitlog write failed"); if self.flush_and_sync().await.is_err() { sync_on_exit = false; break; @@ -318,21 +315,22 @@ where let clog = self.clog.clone(); let span = Span::current(); - spawn_blocking(move || { - let _span = span.enter(); - clog.flush_and_sync() - }) - .await - .expect("commitlog flush-and-sync blocking task panicked") - .inspect_err(|e| warn!("error flushing commitlog: {e:#}")) - .inspect(|maybe_offset| { - if let Some(new_offset) = maybe_offset { - trace!("synced to offset {new_offset}"); - self.durable_offset.send_modify(|val| { - val.replace(*new_offset); - }); - } - }) + let runtime = self.runtime.clone(); + runtime + .spawn_blocking(move || { + let _span = span.enter(); + clog.flush_and_sync() + }) + .await + .inspect_err(|e| warn!("error flushing commitlog: {e:#}")) + .inspect(|maybe_offset| { + if let Some(new_offset) = maybe_offset { + trace!("synced to offset {new_offset}"); + self.durable_offset.send_modify(|val| { + val.replace(*new_offset); + }); + } + }) } } diff --git a/crates/durability/tests/io/fallocate.rs b/crates/durability/tests/io/fallocate.rs index 64e50faf4cc..2783b2178ec 100644 --- a/crates/durability/tests/io/fallocate.rs +++ b/crates/durability/tests/io/fallocate.rs @@ -161,7 +161,7 @@ async fn local_durability( ) -> 
Result, spacetimedb_durability::local::OpenError> { spacetimedb_durability::Local::open( dir, - tokio::runtime::Handle::current(), + spacetimedb_runtime::Runtime::tokio_current(), spacetimedb_durability::local::Options { commitlog: spacetimedb_commitlog::Options { max_segment_size, diff --git a/crates/io/LICENSE b/crates/io/LICENSE new file mode 120000 index 00000000000..8540cf8a991 --- /dev/null +++ b/crates/io/LICENSE @@ -0,0 +1 @@ +../../licenses/BSL.txt \ No newline at end of file diff --git a/crates/runtime/Cargo.toml b/crates/runtime/Cargo.toml new file mode 100644 index 00000000000..4cd0af60869 --- /dev/null +++ b/crates/runtime/Cargo.toml @@ -0,0 +1,24 @@ +[package] +name = "spacetimedb-runtime" +version.workspace = true +edition.workspace = true +license-file = "LICENSE" +description = "Runtime and deterministic simulation utilities for SpacetimeDB" +rust-version.workspace = true + +[lints] +workspace = true + +[dependencies] +tokio = { workspace = true, optional = true } +async-task = { version = "4.4", default-features = false, optional = true } +spin = { version = "0.9", default-features = false, features = ["mutex", "spin_mutex"], optional = true } +libc = { version = "0.2", optional = true } + +[dev-dependencies] +futures.workspace = true + +[features] +default = ["tokio"] +tokio = ["dep:tokio"] +simulation = ["dep:async-task", "dep:spin", "dep:libc"] diff --git a/crates/runtime/DETERMINISM_COVERAGE.md b/crates/runtime/DETERMINISM_COVERAGE.md new file mode 100644 index 00000000000..076efb865e4 --- /dev/null +++ b/crates/runtime/DETERMINISM_COVERAGE.md @@ -0,0 +1,50 @@ +# Determinism Coverage + +This document tracks which sources of nondeterminism are under control in `spacetimedb-runtime`, which ones are only constrained by current architecture, and which ones still escape the simulator boundary. + +It is meant to serve two purposes: + +1. Make the current determinism boundary explicit for runtime code, core crates, and DST harnesses. +2. 
Provide a place to record and review assumptions when a PR changes that boundary. + +## Status Definitions + +- `Controlled` + The simulator or runtime owns this source of nondeterminism directly. Given the same seed and the same simulated inputs, behavior should replay the same way. + +- `Constrained` + This surface is not fully simulator-controlled, but the current architecture limits how it is used. Replay should remain stable if those constraints continue to hold. + +- `Audited` + This surface is not mechanically controlled. Current usage has been reviewed and is believed not to affect replay, but that guarantee depends on call patterns and can regress. + +- `Known Leak` + This source can currently escape simulator control and affect replay. It should be treated as explicit technical debt or a documented exception. + +- `Out of Scope` + This crate does not try to control this surface. If it matters for DST, it must be modeled by a higher-level abstraction or test harness. + +## Control Matrix + +| Surface | Status | Boundary | Current control or assumption | Failure mode if violated | Required direction | +| --- | --- | --- | --- | --- | --- | +| Executor scheduling | Controlled | `runtime::sim::executor` | Runnable selection is driven by seeded simulator RNG | Replay diverges across runs | - | +| Virtual time and timers | Controlled | `runtime::sim::time` | Simulated time advances only through explicit advance or next-timer jump | Timeouts and ordering become host-timing dependent | - | +| Runtime RNG and buggify | Controlled | `runtime::sim::rng` | Runtime RNG drives scheduler and probabilistic fault-injection decisions | RNG and fault decisions are not replayable | - | +| OS thread creation during simulation | Controlled | `runtime::sim_std` | Unix thread hook rejects `std::thread::spawn` while simulation is active | Host scheduler escapes simulator control | - | +| OS entropy | Known Leak | `runtime::sim_std` | Randomness requests warn and then delegate to 
the OS | Same seed can produce different traces | Add backtrace to warnings, remove call sites, eventually fail closed or fully model the source | +| `HashMap` randomized iteration | Audited | Runtime and caller code | Runtime does not force deterministic hash seeding; correctness must not depend on iteration order | Hidden ordering dependencies cause flaky replay | Prefer ordered maps or explicit sorting where observable order matters | +| `tokio::sync` primitives | Constrained | Core crates above runtime | These can be replay-compatible only when all participating tasks remain simulator-owned and progress stays on simulator-controlled async paths | Wake ordering or blocking semantics diverge once code depends on a real runtime or host-driven progress | Audit per primitive and push deep-core paths toward runtime-owned or single-threaded structures | +| `parking_lot::{}` and `std::sync::{}` | Constrained | Core crates, especially datastore | Safe only where access stays single-threaded or non-contended under DST | Host synchronization leaks nondeterministic acquisition order | Keep out of deep-core execution paths; prefer runtime-owned or single-threaded structures | +| File and network I/O | Out of Scope | Runtime crate | Runtime does not simulate filesystem or network behavior | Real I/O timing, ordering, and errors are not replayable | Model via domain-specific DST abstractions | +| Heap allocation and OOM | Known Leak | Broad, especially deep-core direction | Allocation happens through normal Rust paths; deterministic allocation failure is not modeled | Resource-exhaustion behavior is not reproducible | Move the simulation core and eventually deep-core paths toward `no_std + alloc` with explicit allocation boundaries | +| Snapshot / commitlog / datastore host effects | Out of Scope | Higher-level durability and storage layers | Runtime only provides scheduling, time, and fault-decision primitives | Storage semantics depend on real host behavior unless wrapped | 
Model durable behavior through domain-specific DST abstractions | + +## Update Rule + +A PR should update this document if it: + +- introduces a new source of nondeterminism, +- changes the control status of an existing surface, +- adds a new assumption about single-threading, iteration order, runtime ownership, or host behavior, or +- removes a leak or upgrades a surface from `Audited` or `Constrained` to `Controlled`. diff --git a/crates/runtime/LICENSE b/crates/runtime/LICENSE new file mode 100644 index 00000000000..daef5135277 --- /dev/null +++ b/crates/runtime/LICENSE @@ -0,0 +1,731 @@ +SPACETIMEDB BUSINESS SOURCE LICENSE AGREEMENT + +Business Source License 1.1 + +Parameters + +Licensor: Clockwork Laboratories, Inc. +Licensed Work: SpacetimeDB 2.2.0 + The Licensed Work is + (c) 2023 Clockwork Laboratories, Inc. + +Additional Use Grant: You may make use of the Licensed Work provided your + application or service uses the Licensed Work with no + more than one SpacetimeDB instance in production and + provided that you do not use the Licensed Work for a + Database Service. + + A “Database Service” is a commercial offering that + allows third parties (other than your employees and + contractors) to access the functionality of the + Licensed Work by creating tables whose schemas are + controlled by such third parties. + +Change Date: 2031-04-29 + +Change License: GNU Affero General Public License v3.0 with a linking + exception + +For information about alternative licensing arrangements for the Software, +please visit: https://spacetimedb.com + +Notice + +The Business Source License (this document, or the “License”) is not an Open +Source license. However, the Licensed Work will eventually be made available +under an Open Source License, as stated in this License. + +License text copyright (c) 2017 MariaDB Corporation Ab, All Rights Reserved. +“Business Source License” is a trademark of MariaDB Corporation Ab. 
+ +----------------------------------------------------------------------------- + +Business Source License 1.1 + +Terms + +The Licensor hereby grants you the right to copy, modify, create derivative +works, redistribute, and make non-production use of the Licensed Work. The +Licensor may make an Additional Use Grant, above, permitting limited +production use. + +Effective on the Change Date, or the fourth anniversary of the first publicly +available distribution of a specific version of the Licensed Work under this +License, whichever comes first, the Licensor hereby grants you rights under +the terms of the Change License, and the rights granted in the paragraph +above terminate. + +If your use of the Licensed Work does not comply with the requirements +currently in effect as described in this License, you must purchase a +commercial license from the Licensor, its affiliated entities, or authorized +resellers, or you must refrain from using the Licensed Work. + +All copies of the original and modified Licensed Work, and derivative works +of the Licensed Work, are subject to this License. This License applies +separately for each version of the Licensed Work and the Change Date may vary +for each version of the Licensed Work released by Licensor. + +You must conspicuously display this License on each original or modified copy +of the Licensed Work. If you receive the Licensed Work in original or +modified form from a third party, the terms and conditions set forth in this +License apply to your use of that work. + +Any use of the Licensed Work in violation of this License will automatically +terminate your rights under this License for the current and all other +versions of the Licensed Work. + +This License does not grant you any right in any trademark or logo of +Licensor or its affiliates (provided that you may use a trademark or logo of +Licensor as expressly required by this License). 
+ +TO THE EXTENT PERMITTED BY APPLICABLE LAW, THE LICENSED WORK IS PROVIDED ON +AN “AS IS” BASIS. LICENSOR HEREBY DISCLAIMS ALL WARRANTIES AND CONDITIONS, +EXPRESS OR IMPLIED, INCLUDING (WITHOUT LIMITATION) WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NON-INFRINGEMENT, AND +TITLE. + +MariaDB hereby grants you permission to use this License’s text to license +your works, and to refer to it using the trademark “Business Source License”, +as long as you comply with the Covenants of Licensor below. + +Covenants of Licensor + +In consideration of the right to use this License’s text and the “Business +Source License” name and trademark, Licensor covenants to MariaDB, and to all +other recipients of the licensed work to be provided by Licensor: + +1. To specify as the Change License the GPL Version 2.0 or any later version, + or a license that is compatible with GPL Version 2.0 or a later version, + where “compatible” means that software provided under the Change License can + be included in a program with software provided under GPL Version 2.0 or a + later version. Licensor may specify additional Change Licenses without + limitation. + +2. To either: (a) specify an additional grant of rights to use that does not + impose any additional restriction on the right granted in this License, as + the Additional Use Grant; or (b) insert the text “None”. + +3. To specify a Change Date. + +4. Not to modify this License in any other way. + +----------------------------------------------------------------------------- + +Copyright (C) 2023 Clockwork Laboratories, Inc. + +This program is free software: you can redistribute it and/or modify it under +the terms of the GNU Affero General Public License, version 3, as published +by the Free Software Foundation. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. 
See the GNU General Public License for more +details. + +You should have received a copy of the GNU Affero General Public License +along with this program; if not, see <https://www.gnu.org/licenses/>. + +Additional permission under GNU GPL version 3 section 7 + +If you modify this Program, or any covered work, by linking or combining it +with SpacetimeDB (or a modified version of that library), containing parts +covered by the terms of the AGPL v3.0, the licensors of this Program grant +you additional permission to convey the resulting work. + +Additional permission under GNU AGPL version 3 section 13 + +If you modify this Program, or any covered work, by linking or combining it +with SpacetimeDB (or a modified version of that library), containing parts +covered by the terms of the AGPL v3.0, the licensors of this Program grant +you additional permission that, notwithstanding any other provision of this +License, you need not prominently offer all users interacting with your +modified version remotely through a computer network an opportunity to +receive the Corresponding Source of your version from a network server at no +charge, if your version supports such interaction. This permission does not +waive or modify any other obligations or terms of the AGPL v3.0, except for +the specific requirement set forth in section 13. + +A copy of the AGPL v3.0 license is reproduced below. + + GNU AFFERO GENERAL PUBLIC LICENSE + Version 3, 19 November 2007 + +Copyright © 2007 Free Software Foundation, Inc. +Everyone is permitted to copy and distribute verbatim copies of this license +document, but changing it is not allowed. + +Preamble +The GNU Affero General Public License is a free, copyleft license for +software and other kinds of works, specifically designed to ensure +cooperation with the community in the case of network server software. + +The licenses for most software and other practical works are designed to take +away your freedom to share and change the works. 
By contrast, our General +Public Licenses are intended to guarantee your freedom to share and change +all versions of a program--to make sure it remains free software for all its +users. + +When we speak of free software, we are referring to freedom, not price. Our +General Public Licenses are designed to make sure that you have the freedom +to distribute copies of free software (and charge for them if you wish), that +you receive source code or can get it if you want it, that you can change the +software or use pieces of it in new free programs, and that you know you can +do these things. + +Developers that use our General Public Licenses protect your rights with two +steps: (1) assert copyright on the software, and (2) offer you this License +which gives you legal permission to copy, distribute and/or modify the +software. + +A secondary benefit of defending all users' freedom is that improvements made +in alternate versions of the program, if they receive widespread use, become +available for other developers to incorporate. Many developers of free +software are heartened and encouraged by the resulting cooperation. However, +in the case of software used on network servers, this result may fail to come +about. The GNU General Public License permits making a modified version and +letting the public access it on a server without ever releasing its source +code to the public. + +The GNU Affero General Public License is designed specifically to ensure +that, in such cases, the modified source code becomes available to the +community. It requires the operator of a network server to provide the source +code of the modified version running there to the users of that server. +Therefore, public use of a modified version, on a publicly accessible server, +gives the public access to the source code of the modified version. + +An older license, called the Affero General Public License and published by +Affero, was designed to accomplish similar goals. 
This is a different +license, not a version of the Affero GPL, but Affero has released a new +version of the Affero GPL which permits relicensing under this license. + +The precise terms and conditions for copying, distribution and modification +follow. + +TERMS AND CONDITIONS +0. Definitions. +"This License" refers to version 3 of the GNU Affero General Public License. + +"Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + +"The Program" refers to any copyrightable work licensed under this License. +Each licensee is addressed as "you". "Licensees" and "recipients" may be +individuals or organizations. + +To "modify" a work means to copy from or adapt all or part of the work in a +fashion requiring copyright permission, other than the making of an exact +copy. The resulting work is called a "modified version" of the earlier work +or a work "based on" the earlier work. + +A "covered work" means either the unmodified Program or a work based on the +Program. + +To "propagate" a work means to do anything with it that, without permission, +would make you directly or secondarily liable for infringement under +applicable copyright law, except executing it on a computer or modifying a +private copy. Propagation includes copying, distribution (with or without +modification), making available to the public, and in some countries other +activities as well. + +To "convey" a work means any kind of propagation that enables other parties +to make or receive copies. Mere interaction with a user through a computer +network, with no transfer of a copy, is not conveying. 
+ +An interactive user interface displays "Appropriate Legal Notices" to the +extent that it includes a convenient and prominently visible feature that (1) +displays an appropriate copyright notice, and (2) tells the user that there +is no warranty for the work (except to the extent that warranties are +provided), that licensees may convey the work under this License, and how to +view a copy of this License. If the interface presents a list of user +commands or options, such as a menu, a prominent item in the list meets this +criterion. + +1. Source Code. +The "source code" for a work means the preferred form of the work for making +modifications to it. "Object code" means any non-source form of a work. + +A "Standard Interface" means an interface that either is an official standard +defined by a recognized standards body, or, in the case of interfaces +specified for a particular programming language, one that is widely used +among developers working in that language. + +The "System Libraries" of an executable work include anything, other than the +work as a whole, that (a) is included in the normal form of packaging a Major +Component, but which is not part of that Major Component, and (b) serves only +to enable use of the work with that Major Component, or to implement a +Standard Interface for which an implementation is available to the public in +source code form. A "Major Component", in this context, means a major +essential component (kernel, window system, and so on) of the specific +operating system (if any) on which the executable work runs, or a compiler +used to produce the work, or an object code interpreter used to run it. + +The "Corresponding Source" for a work in object code form means all the +source code needed to generate, install, and (for an executable work) run the +object code and to modify the work, including scripts to control those +activities. 
However, it does not include the work's System Libraries, or +general-purpose tools or generally available free programs which are used +unmodified in performing those activities but which are not part of the work. +For example, Corresponding Source includes interface definition files +associated with source files for the work, and the source code for shared +libraries and dynamically linked subprograms that the work is specifically +designed to require, such as by intimate data communication or control flow +between those subprograms and other parts of the work. + +The Corresponding Source need not include anything that users can regenerate +automatically from other parts of the Corresponding Source. + +The Corresponding Source for a work in source code form is that same work. + +2. Basic Permissions. +All rights granted under this License are granted for the term of copyright +on the Program, and are irrevocable provided the stated conditions are met. +This License explicitly affirms your unlimited permission to run the +unmodified Program. The output from running a covered work is covered by this +License only if the output, given its content, constitutes a covered work. +This License acknowledges your rights of fair use or other equivalent, as +provided by copyright law. + +You may make, run and propagate covered works that you do not convey, without +conditions so long as your license otherwise remains in force. You may convey +covered works to others for the sole purpose of having them make +modifications exclusively for you, or provide you with facilities for running +those works, provided that you comply with the terms of this License in +conveying all material for which you do not control copyright. Those thus +making or running the covered works for you must do so exclusively on your +behalf, under your direction and control, on terms that prohibit them from +making any copies of your copyrighted material outside their relationship +with you. 
+ +Conveying under any other circumstances is permitted solely under the +conditions stated below. Sublicensing is not allowed; section 10 makes it +unnecessary. + +3. Protecting Users' Legal Rights From Anti-Circumvention Law. +No covered work shall be deemed part of an effective technological measure +under any applicable law fulfilling obligations under article 11 of the WIPO +copyright treaty adopted on 20 December 1996, or similar laws prohibiting or +restricting circumvention of such measures. + +When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention is +effected by exercising rights under this License with respect to the covered +work, and you disclaim any intention to limit operation or modification of +the work as a means of enforcing, against the work's users, your or third +parties' legal rights to forbid circumvention of technological measures. + +4. Conveying Verbatim Copies. +You may convey verbatim copies of the Program's source code as you receive +it, in any medium, provided that you conspicuously and appropriately publish +on each copy an appropriate copyright notice; keep intact all notices stating +that this License and any non-permissive terms added in accord with section 7 +apply to the code; keep intact all notices of the absence of any warranty; +and give all recipients a copy of this License along with the Program. + +You may charge any price or no price for each copy that you convey, and you +may offer support or warranty protection for a fee. + +5. Conveying Modified Source Versions. +You may convey a work based on the Program, or the modifications to produce +it from the Program, in the form of source code under the terms of section 4, +provided that you also meet all of these conditions: + +a) The work must carry prominent notices stating that you modified it, and +giving a relevant date. 
+b) The work must carry prominent notices stating that it is released under +this License and any conditions added under section 7. This requirement +modifies the requirement in section 4 to "keep intact all notices". +c) You must license the entire work, as a whole, under this License to anyone +who comes into possession of a copy. This License will therefore apply, along +with any applicable section 7 additional terms, to the whole of the work, and +all its parts, regardless of how they are packaged. This License gives no +permission to license the work in any other way, but it does not invalidate +such permission if you have separately received it. +d) If the work has interactive user interfaces, each must display Appropriate +Legal Notices; however, if the Program has interactive interfaces that do not +display Appropriate Legal Notices, your work need not make them do so. +A compilation of a covered work with other separate and independent works, +which are not by their nature extensions of the covered work, and which are +not combined with it such as to form a larger program, in or on a volume of a +storage or distribution medium, is called an "aggregate" if the compilation +and its resulting copyright are not used to limit the access or legal rights +of the compilation's users beyond what the individual works permit. Inclusion +of a covered work in an aggregate does not cause this License to apply to the +other parts of the aggregate. + +6. Conveying Non-Source Forms. +You may convey a covered work in object code form under the terms of sections +4 and 5, provided that you also convey the machine-readable Corresponding +Source under the terms of this License, in one of these ways: + +a) Convey the object code in, or embodied in, a physical product (including a +physical distribution medium), accompanied by the Corresponding Source fixed +on a durable physical medium customarily used for software interchange. 
+b) Convey the object code in, or embodied in, a physical product (including a +physical distribution medium), accompanied by a written offer, valid for at +least three years and valid for as long as you offer spare parts or customer +support for that product model, to give anyone who possesses the object code +either (1) a copy of the Corresponding Source for all the software in the +product that is covered by this License, on a durable physical medium +customarily used for software interchange, for a price no more than your +reasonable cost of physically performing this conveying of source, or (2) +access to copy the Corresponding Source from a network server at no charge. +c) Convey individual copies of the object code with a copy of the written +offer to provide the Corresponding Source. This alternative is allowed only +occasionally and noncommercially, and only if you received the object code +with such an offer, in accord with subsection 6b. +d) Convey the object code by offering access from a designated place (gratis +or for a charge), and offer equivalent access to the Corresponding Source in +the same way through the same place at no further charge. You need not +require recipients to copy the Corresponding Source along with the object +code. If the place to copy the object code is a network server, the +Corresponding Source may be on a different server (operated by you or a third +party) that supports equivalent copying facilities, provided you maintain +clear directions next to the object code saying where to find the +Corresponding Source. Regardless of what server hosts the Corresponding +Source, you remain obligated to ensure that it is available for as long as +needed to satisfy these requirements. +e) Convey the object code using peer-to-peer transmission, provided you +inform other peers where the object code and Corresponding Source of the work +are being offered to the general public at no charge under subsection 6d. 
+A separable portion of the object code, whose source code is excluded from +the Corresponding Source as a System Library, need not be included in +conveying the object code work. + +A "User Product" is either (1) a "consumer product", which means any tangible +personal property which is normally used for personal, family, or household +purposes, or (2) anything designed or sold for incorporation into a dwelling. +In determining whether a product is a consumer product, doubtful cases shall +be resolved in favor of coverage. For a particular product received by a +particular user, "normally used" refers to a typical or common use of that +class of product, regardless of the status of the particular user or of the +way in which the particular user actually uses, or expects or is expected to +use, the product. A product is a consumer product regardless of whether the +product has substantial commercial, industrial or non-consumer uses, unless +such uses represent the only significant mode of use of the product. + +"Installation Information" for a User Product means any methods, procedures, +authorization keys, or other information required to install and execute +modified versions of a covered work in that User Product from a modified +version of its Corresponding Source. The information must suffice to ensure +that the continued functioning of the modified object code is in no case +prevented or interfered with solely because modification has been made. + +If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as part of +a transaction in which the right of possession and use of the User Product is +transferred to the recipient in perpetuity or for a fixed term (regardless of +how the transaction is characterized), the Corresponding Source conveyed +under this section must be accompanied by the Installation Information. 
But +this requirement does not apply if neither you nor any third party retains +the ability to install modified object code on the User Product (for example, +the work has been installed in ROM). + +The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates for +a work that has been modified or installed by the recipient, or for the User +Product in which it has been modified or installed. Access to a network may +be denied when the modification itself materially and adversely affects the +operation of the network or violates the rules and protocols for +communication across the network. + +Corresponding Source conveyed, and Installation Information provided, in +accord with this section must be in a format that is publicly documented (and +with an implementation available to the public in source code form), and must +require no special password or key for unpacking, reading or copying. + +7. Additional Terms. +"Additional permissions" are terms that supplement the terms of this License +by making exceptions from one or more of its conditions. Additional +permissions that are applicable to the entire Program shall be treated as +though they were included in this License, to the extent that they are valid +under applicable law. If additional permissions apply only to part of the +Program, that part may be used separately under those permissions, but the +entire Program remains governed by this License without regard to the +additional permissions. + +When you convey a copy of a covered work, you may at your option remove any +additional permissions from that copy, or from any part of it. (Additional +permissions may be written to require their own removal in certain cases when +you modify the work.) You may place additional permissions on material, added +by you to a covered work, for which you have or can give appropriate +copyright permission. 
+ +Notwithstanding any other provision of this License, for material you add to +a covered work, you may (if authorized by the copyright holders of that +material) supplement the terms of this License with terms: + +a) Disclaiming warranty or limiting liability differently from the terms of +sections 15 and 16 of this License; or +b) Requiring preservation of specified reasonable legal notices or author +attributions in that material or in the Appropriate Legal Notices displayed +by works containing it; or +c) Prohibiting misrepresentation of the origin of that material, or requiring +that modified versions of such material be marked in reasonable ways as +different from the original version; or +d) Limiting the use for publicity purposes of names of licensors or authors +of the material; or +e) Declining to grant rights under trademark law for use of some trade names, +trademarks, or service marks; or +f) Requiring indemnification of licensors and authors of that material by +anyone who conveys the material (or modified versions of it) with contractual +assumptions of liability to the recipient, for any liability that these +contractual assumptions directly impose on those licensors and authors. +All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. If the Program as you +received it, or any part of it, contains a notice stating that it is governed +by this License along with a term that is a further restriction, you may +remove that term. If a license document contains a further restriction but +permits relicensing or conveying under this License, you may add to a covered +work material governed by the terms of that license document, provided that +the further restriction does not survive such relicensing or conveying. 
+ +If you add terms to a covered work in accord with this section, you must +place, in the relevant source files, a statement of the additional terms that +apply to those files, or a notice indicating where to find the applicable +terms. + +Additional terms, permissive or non-permissive, may be stated in the form of +a separately written license, or stated as exceptions; the above requirements +apply either way. + +8. Termination. +You may not propagate or modify a covered work except as expressly provided +under this License. Any attempt otherwise to propagate or modify it is void, +and will automatically terminate your rights under this License (including +any patent licenses granted under the third paragraph of section 11). + +However, if you cease all violation of this License, then your license from a +particular copyright holder is reinstated (a) provisionally, unless and until +the copyright holder explicitly and finally terminates your license, and (b) +permanently, if the copyright holder fails to notify you of the violation by +some reasonable means prior to 60 days after the cessation. + +Moreover, your license from a particular copyright holder is reinstated +permanently if the copyright holder notifies you of the violation by some +reasonable means, this is the first time you have received notice of +violation of this License (for any work) from that copyright holder, and you +cure the violation prior to 30 days after your receipt of the notice. + +Termination of your rights under this section does not terminate the licenses +of parties who have received copies or rights from you under this License. If +your rights have been terminated and not permanently reinstated, you do not +qualify to receive new licenses for the same material under section 10. + +9. Acceptance Not Required for Having Copies. +You are not required to accept this License in order to receive or run a copy +of the Program. 
Ancillary propagation of a covered work occurring solely as a +consequence of using peer-to-peer transmission to receive a copy likewise +does not require acceptance. However, nothing other than this License grants +you permission to propagate or modify any covered work. These actions +infringe copyright if you do not accept this License. Therefore, by modifying +or propagating a covered work, you indicate your acceptance of this License +to do so. + +10. Automatic Licensing of Downstream Recipients. +Each time you convey a covered work, the recipient automatically receives a +license from the original licensors, to run, modify and propagate that work, +subject to this License. You are not responsible for enforcing compliance by +third parties with this License. + +An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. If propagation of a covered work +results from an entity transaction, each party to that transaction who +receives a copy of the work also receives whatever licenses to the work the +party's predecessor in interest had or could give under the previous +paragraph, plus a right to possession of the Corresponding Source of the work +from the predecessor in interest, if the predecessor has it or can get it +with reasonable efforts. + +You may not impose any further restrictions on the exercise of the rights +granted or affirmed under this License. For example, you may not impose a +license fee, royalty, or other charge for exercise of rights granted under +this License, and you may not initiate litigation (including a cross-claim or +counterclaim in a lawsuit) alleging that any patent claim is infringed by +making, using, selling, offering for sale, or importing the Program or any +portion of it. + +11. Patents. 
+A "contributor" is a copyright holder who authorizes use under this License +of the Program or a work on which the Program is based. The work thus +licensed is called the contributor's "contributor version". + +A contributor's "essential patent claims" are all patent claims owned or +controlled by the contributor, whether already acquired or hereafter +acquired, that would be infringed by some manner, permitted by this License, +of making, using, or selling its contributor version, but do not include +claims that would be infringed only as a consequence of further modification +of the contributor version. For purposes of this definition, "control" +includes the right to grant patent sublicenses in a manner consistent with +the requirements of this License. + +Each contributor grants you a non-exclusive, worldwide, royalty-free patent +license under the contributor's essential patent claims, to make, use, sell, +offer for sale, import and otherwise run, modify and propagate the contents +of its contributor version. + +In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent (such +as an express permission to practice a patent or covenant not to sue for +patent infringement). To "grant" such a patent license to a party means to +make such an agreement or commitment not to enforce a patent against the +party. 
+ +If you convey a covered work, knowingly relying on a patent license, and the +Corresponding Source of the work is not available for anyone to copy, free of +charge and under the terms of this License, through a publicly available +network server or other readily accessible means, then you must either (1) +cause the Corresponding Source to be so available, or (2) arrange to deprive +yourself of the benefit of the patent license for this particular work, or +(3) arrange, in a manner consistent with the requirements of this License, to +extend the patent license to downstream recipients. "Knowingly relying" means +you have actual knowledge that, but for the patent license, your conveying +the covered work in a country, or your recipient's use of the covered work in +a country, would infringe one or more identifiable patents in that country +that you have reason to believe are valid. + +If, pursuant to or in connection with a single transaction or arrangement, +you convey, or propagate by procuring conveyance of, a covered work, and +grant a patent license to some of the parties receiving the covered work +authorizing them to use, propagate, modify or convey a specific copy of the +covered work, then the patent license you grant is automatically extended to +all recipients of the covered work and works based on it. + +A patent license is "discriminatory" if it does not include within the scope +of its coverage, prohibits the exercise of, or is conditioned on the +non-exercise of one or more of the rights that are specifically granted under +this License. 
You may not convey a covered work if you are a party to an +arrangement with a third party that is in the business of distributing +software, under which you make payment to the third party based on the extent +of your activity of conveying the work, and under which the third party +grants, to any of the parties who would receive the covered work from you, a +discriminatory patent license (a) in connection with copies of the covered +work conveyed by you (or copies made from those copies), or (b) primarily for +and in connection with specific products or compilations that contain the +covered work, unless you entered into that arrangement, or that patent +license was granted, prior to 28 March 2007. + +Nothing in this License shall be construed as excluding or limiting any +implied license or other defenses to infringement that may otherwise be +available to you under applicable patent law. + +12. No Surrender of Others' Freedom. +If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not excuse +you from the conditions of this License. If you cannot convey a covered work +so as to satisfy simultaneously your obligations under this License and any +other pertinent obligations, then as a consequence you may not convey it at +all. For example, if you agree to terms that obligate you to collect a +royalty for further conveying from those to whom you convey the Program, the +only way you could satisfy both those terms and this License would be to +refrain entirely from conveying the Program. + +13. Remote Network Interaction; Use with the GNU General Public License. 
+Notwithstanding any other provision of this License, if you modify the +Program, your modified version must prominently offer all users interacting +with it remotely through a computer network (if your version supports such +interaction) an opportunity to receive the Corresponding Source of your +version by providing access to the Corresponding Source from a network server +at no charge, through some standard or customary means of facilitating +copying of software. This Corresponding Source shall include the +Corresponding Source for any work covered by version 3 of the GNU General +Public License that is incorporated pursuant to the following paragraph. + +Notwithstanding any other provision of this License, you have permission to +link or combine any covered work with a work licensed under version 3 of the +GNU General Public License into a single combined work, and to convey the +resulting work. The terms of this License will continue to apply to the part +which is the covered work, but the work with which it is combined will remain +governed by version 3 of the GNU General Public License. + +14. Revised Versions of this License. +The Free Software Foundation may publish revised and/or new versions of the +GNU Affero General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies that a certain numbered version of the GNU Affero General Public +License "or any later version" applies to it, you have the option of +following the terms and conditions either of that numbered version or of any +later version published by the Free Software Foundation. If the Program does +not specify a version number of the GNU Affero General Public License, you +may choose any version ever published by the Free Software Foundation. 
+ +If the Program specifies that a proxy can decide which future versions of the +GNU Affero General Public License can be used, that proxy's public statement +of acceptance of a version permanently authorizes you to choose that version +for the Program. + +Later license versions may give you additional or different permissions. +However, no additional obligations are imposed on any author or copyright +holder as a result of your choosing to follow a later version. + +15. Disclaimer of Warranty. +THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE +LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR +OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, +EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE +ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. +SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY +SERVICING, REPAIR OR CORRECTION. + +16. Limitation of Liability. +IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL +ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS THE +PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE +OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR +DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR +A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH +HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. + +17. Interpretation of Sections 15 and 16. 
+If the disclaimer of warranty and limitation of liability provided above +cannot be given local legal effect according to their terms, reviewing courts +shall apply local law that most closely approximates an absolute waiver of +all civil liability in connection with the Program, unless a warranty or +assumption of liability accompanies a copy of the Program in return for a +fee. + +END OF TERMS AND CONDITIONS + +How to Apply These Terms to Your New Programs +If you develop a new program, and you want it to be of the greatest possible +use to the public, the best way to achieve this is to make it free software +which everyone can redistribute and change under these terms. + +To do so, attach the following notices to the program. It is safest to attach +them to the start of each source file to most effectively state the exclusion +of warranty; and each file should have at least the "copyright" line and a +pointer to where the full notice is found. + +SpacetimeDB: A database which replaces your server. +Copyright (C) 2023 Clockwork Laboratories, Inc. + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU Affero General Public License as +published by the Free Software Foundation, either version 3 of the +License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU Affero General Public License for more details. + +You should have received a copy of the GNU Affero General Public License +along with this program. If not, see <https://www.gnu.org/licenses/>. +Also add information on how to contact you by electronic and paper mail. + +If your software can interact with users remotely through a computer network, +you should also make sure that it provides a way for users to get its source.
+For example, if your program is a web application, its interface could +display a "Source" link that leads users to an archive of the code. There are +many ways you could offer source, and different solutions will be better for +different programs; see section 13 for the specific requirements. + +You should also get your employer (if you work as a programmer) or school, if +any, to sign a "copyright disclaimer" for the program, if necessary. For more +information on this, and how to apply and follow the GNU AGPL, see +<https://www.gnu.org/licenses/>. diff --git a/crates/runtime/README.md b/crates/runtime/README.md new file mode 100644 index 00000000000..68037d752bf --- /dev/null +++ b/crates/runtime/README.md @@ -0,0 +1,61 @@ +> Welcome to the Matrix! + +# spacetimedb-runtime + +`spacetimedb-runtime` is a runtime boundary that lets SpacetimeDB core code run under deterministic simulation testing (DST). + +DST runs code inside a deterministic simulator that controls nondeterministic inputs instead of letting them come directly from the OS and real runtime environment. Given the same seed, the simulator should produce the same trace. When it finds a bug, the seed should be enough to reproduce that bug exactly. + +For this to work, code under test must not read clocks, randomness, scheduling, I/O, or network behavior directly from the outer environment. Those effects need interfaces that production can implement with real runtime-backed services and DST can replace with simulated ones. + +This crate provides the execution-control part of that boundary: spawning, timeouts, virtual time, deterministic randomness, task scheduling, and fault decisions. Storage, networking, and replication should be modeled through higher-level abstractions. + +For a tracked view of what is currently under simulator control, what is only constrained by convention, and what still leaks host behavior, see [DETERMINISM_COVERAGE.md](./DETERMINISM_COVERAGE.md).
+ +## Architecture + +[src/lib.rs](./src/lib.rs) exposes `Handle`, a small runtime handle shared code carries. It has two variants: + +- `Handle::Tokio(TokioHandle)` for real runtime execution. +- `Handle::Simulation(sim::Handle)` for deterministic simulation. + +[src/sim](./src/sim) contains the simulation core. It is single-threaded and targets `no_std + alloc`. The module includes: + +- `executor`: single-threaded task scheduler with deterministic runnable selection. +- `time`: virtual clock, sleeps, and timeouts. +- `rng`: seeded deterministic randomness for scheduler and workload decisions. +- `buggify`: fault-injection surface. Calls rng to decide probabilistically whether to inject failures into simulated operations. +- `node`: node builders and node-local scheduling handles. + +[src/sim_std.rs](./src/sim_std.rs) contains `std`/OS glue around the simulator: + +- `block_on` installs simulation guards for tests running in a normal process. +- `check_determinism` replays the same seeded workload twice and compares traces. +- libc randomness hooks warn and delegate if code reaches OS entropy. +- Unix thread hooks reject accidental `std::thread::spawn` while simulation is active. + +Tokio integration is intentionally small and lives directly in [src/lib.rs](./src/lib.rs). + +Feature flags: + +- `tokio`: enables the Tokio runtime backend and remains in the default feature set. +- `simulation`: enables the deterministic simulation runtime and `sim_std` helpers. + +## Design Principles + +- **Single-threaded runtime.** The simulator exposes interleaving and timeout bugs, but not bugs that require true parallel execution. The direction is to keep deep-core code single-threaded or close to thread-per-core; simulating real parallelism is out of scope. + +- **No built-in network, storage, or I/O simulation.** This crate provides deterministic execution primitives only. Higher-level harnesses should model message delivery, disk behavior, and failures. 
+ +- **Not a Tokio replacement.** This crate does not aim to simulate APIs like `tokio::net` or `tokio::fs`. Code that depends on them needs a higher-level abstraction boundary. + +- **Zero dependency.** The simulation core in `sim/` is already `no_std + alloc`. The `sim_std` module is a thin OS-facing wrapper — the std dependency lives there, not in the simulation core itself. It stays until the application logic above this crate also moves to `no_std`. + +## Current Limitations + + +- **One shared virtual clock.** All simulated nodes share a single clock. This masks bugs related to timing mismatch across machines. + +- **No good alternative for blocking APIs.** The simulation backend has no `spawn_blocking` pool or OS thread escape hatch. APIs like `spawn_blocking` or `Handle::block_on` delegate to the single executor thread, so blocking inside them stalls all simulated tasks. The direction is to avoid relying on blocking semantics inside the simulation boundary. + +- **OS randomness is not controlled.** `sim_std` warns if code reaches OS entropy. The direction is to keep application code and testing harnesses off OS randomness entirely.
diff --git a/crates/runtime/src/lib.rs b/crates/runtime/src/lib.rs new file mode 100644 index 00000000000..5611d5db3ed --- /dev/null +++ b/crates/runtime/src/lib.rs @@ -0,0 +1,336 @@ +#[cfg(feature = "simulation")] +extern crate alloc; + +use core::{ + fmt, + future::Future, + marker::PhantomData, + pin::Pin, + task::{Context, Poll}, + time::Duration, +}; + +#[cfg(feature = "simulation")] +pub mod sim; +#[cfg(feature = "simulation")] +pub mod sim_std; + +#[cfg(feature = "tokio")] +pub type TokioHandle = tokio::runtime::Handle; + +#[derive(Clone)] +pub enum Handle { + #[cfg(feature = "tokio")] + Tokio(TokioHandle), + #[cfg(feature = "simulation")] + Simulation(sim::Handle), +} + +pub struct JoinHandle { + inner: JoinHandleInner, +} + +pub struct AbortHandle { + inner: AbortHandleInner, +} + +enum JoinHandleInner { + #[cfg(feature = "tokio")] + Tokio(tokio::task::JoinHandle), + #[cfg(feature = "simulation")] + Simulation(sim::JoinHandle), + Detached(PhantomData), +} + +enum AbortHandleInner { + #[cfg(feature = "tokio")] + Tokio(tokio::task::AbortHandle), + #[cfg(feature = "simulation")] + Simulation(sim::AbortHandle), +} + +#[derive(Debug)] +pub struct JoinError { + inner: JoinErrorInner, +} + +#[derive(Debug)] +enum JoinErrorInner { + #[cfg(feature = "tokio")] + Tokio(tokio::task::JoinError), + #[cfg(feature = "simulation")] + Simulation(sim::JoinError), +} + +impl AbortHandle { + pub fn abort(&self) { + match &self.inner { + #[cfg(feature = "tokio")] + AbortHandleInner::Tokio(handle) => handle.abort(), + #[cfg(feature = "simulation")] + AbortHandleInner::Simulation(handle) => handle.abort(), + #[cfg(not(any(feature = "tokio", feature = "simulation")))] + _ => unreachable!("runtime abort handle has no enabled backend"), + } + } +} + +impl JoinErrorInner { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + #[cfg(feature = "tokio")] + Self::Tokio(err) => fmt::Display::fmt(err, f), + #[cfg(feature = "simulation")] + Self::Simulation(err) => 
fmt::Display::fmt(err, f), + } + } +} + +impl fmt::Display for JoinError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + #[cfg(not(any(feature = "tokio", feature = "simulation")))] + let _ = f; + #[cfg(any(feature = "tokio", feature = "simulation"))] + return self.inner.fmt(f); + #[cfg(not(any(feature = "tokio", feature = "simulation")))] + unreachable!("runtime join error has no enabled backend") + } +} + +#[cfg(any(feature = "tokio", feature = "simulation"))] +impl std::error::Error for JoinError {} + +impl JoinHandleInner { + fn abort_handle(&self) -> AbortHandle { + match self { + #[cfg(feature = "tokio")] + Self::Tokio(handle) => AbortHandle { + inner: AbortHandleInner::Tokio(handle.abort_handle()), + }, + #[cfg(feature = "simulation")] + Self::Simulation(handle) => AbortHandle { + inner: AbortHandleInner::Simulation(handle.abort_handle()), + }, + Self::Detached(_) => unreachable!("abort_handle called on a completed handle"), + } + } + + fn poll_result(&mut self, cx: &mut Context<'_>) -> Poll> { + match self { + #[cfg(feature = "tokio")] + Self::Tokio(handle) => match Pin::new(handle).poll(cx) { + Poll::Ready(Ok(output)) => Poll::Ready(Ok(output)), + Poll::Ready(Err(err)) => Poll::Ready(Err(JoinError { + inner: JoinErrorInner::Tokio(err), + })), + Poll::Pending => Poll::Pending, + }, + #[cfg(feature = "simulation")] + Self::Simulation(handle) => match Pin::new(handle).poll_join(cx) { + Poll::Ready(Ok(output)) => Poll::Ready(Ok(output)), + Poll::Ready(Err(err)) => Poll::Ready(Err(JoinError { + inner: JoinErrorInner::Simulation(err), + })), + Poll::Pending => Poll::Pending, + }, + Self::Detached(_) => unreachable!("poll_result called on a completed handle"), + } + } +} + +impl JoinHandle { + pub fn abort_handle(&self) -> AbortHandle { + self.inner.abort_handle() + } +} + +impl Future for JoinHandle { + type Output = Result; + + fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + #[cfg(not(any(feature = "tokio", feature = 
"simulation")))] + let _ = cx; + match self.inner.poll_result(cx) { + Poll::Ready(Ok(output)) => { + self.inner = JoinHandleInner::Detached(PhantomData); + Poll::Ready(Ok(output)) + } + Poll::Ready(Err(err)) => Poll::Ready(Err(err)), + Poll::Pending => Poll::Pending, + } + } +} + +impl Drop for JoinHandle { + fn drop(&mut self) { + let inner = core::mem::replace(&mut self.inner, JoinHandleInner::Detached(PhantomData)); + #[cfg(feature = "simulation")] + if let JoinHandleInner::Simulation(handle) = inner { + handle.detach(); + return; + } + // For Tokio (and Detached), dropping the handle does not cancel the task. + drop(inner); + } +} + +impl Unpin for JoinHandle {} + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub struct RuntimeTimeout; + +impl fmt::Display for RuntimeTimeout { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str("runtime operation timed out") + } +} + +#[cfg(any(feature = "tokio", feature = "simulation"))] +impl std::error::Error for RuntimeTimeout {} + +#[cfg(feature = "tokio")] +impl Handle { + pub fn tokio(handle: TokioHandle) -> Self { + Self::Tokio(handle) + } + + pub fn tokio_current() -> Self { + Self::tokio(TokioHandle::current()) + } +} + +#[cfg(feature = "simulation")] +impl Handle { + pub fn simulation(handle: sim::Handle) -> Self { + Self::Simulation(handle) + } +} + +impl Handle { + pub fn spawn(&self, future: impl Future + Send + 'static) -> JoinHandle { + #[cfg(not(any(feature = "tokio", feature = "simulation")))] + let _ = future; + match self { + #[cfg(feature = "tokio")] + Self::Tokio(handle) => JoinHandle { + inner: JoinHandleInner::Tokio(handle.spawn(future)), + }, + #[cfg(feature = "simulation")] + Self::Simulation(handle) => JoinHandle { + inner: JoinHandleInner::Simulation(handle.spawn_on(sim::NodeId::MAIN, future)), + }, + #[cfg(not(any(feature = "tokio", feature = "simulation")))] + _ => unreachable!("runtime dispatch has no enabled backend"), + } + } + + pub async fn spawn_blocking(&self, f: F) 
-> R + where + F: FnOnce() -> R + Send + 'static, + R: Send + 'static, + { + #[cfg(not(any(feature = "tokio", feature = "simulation")))] + let _ = &f; + match self { + #[cfg(feature = "tokio")] + Self::Tokio(_) => tokio::task::spawn_blocking(f) + .await + .unwrap_or_else(|e| match e.try_into_panic() { + Ok(panic_payload) => std::panic::resume_unwind(panic_payload), + Err(e) => panic!("Unexpected JoinError: {e}"), + }), + // This is only a facade placeholder for simulation today. It + // delegates to a normal simulated task, so the closure still runs + // on the single executor thread and can block overall runtime + // progress. Callers should not expect blocking-pool semantics on + // the simulation backend. + #[cfg(feature = "simulation")] + Self::Simulation(handle) => handle + .spawn_on(sim::NodeId::MAIN, async move { f() }) + .await + .expect("simulation spawn_blocking task should not be cancelled"), + #[cfg(not(any(feature = "tokio", feature = "simulation")))] + _ => unreachable!("runtime dispatch has no enabled backend"), + } + } + + pub async fn timeout( + &self, + timeout_after: Duration, + future: impl Future, + ) -> Result { + #[cfg(not(any(feature = "tokio", feature = "simulation")))] + let _ = (timeout_after, future); + match self { + #[cfg(feature = "tokio")] + Self::Tokio(_) => tokio::time::timeout(timeout_after, future) + .await + .map_err(|_| RuntimeTimeout), + #[cfg(feature = "simulation")] + Self::Simulation(handle) => handle.timeout(timeout_after, future).await.map_err(|_| RuntimeTimeout), + #[cfg(not(any(feature = "tokio", feature = "simulation")))] + _ => unreachable!("runtime dispatch has no enabled backend"), + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::sync::{ + atomic::{AtomicBool, Ordering}, + Arc, + }; + + #[cfg(feature = "simulation")] + #[test] + fn dropping_joinhandle_does_not_cancel_task_in_simulation() { + use crate::sim::Runtime; + let mut rt = Runtime::new(4); + let handle = Handle::simulation(rt.handle()); + 
let flag = Arc::new(AtomicBool::new(false)); + let flag_clone = flag.clone(); + + rt.block_on(async { + let jh = handle.spawn(async move { + flag_clone.store(true, Ordering::Release); + }); + drop(jh); + + // Yield so the spawned task gets polled. + handle + .timeout(std::time::Duration::from_millis(50), async {}) + .await + .ok(); + }); + + assert!(flag.load(Ordering::Acquire)); + } + + #[cfg(feature = "simulation")] + #[test] + fn abort_cancels_task_in_simulation() { + use crate::sim::Runtime; + let mut rt = Runtime::new(4); + let handle = Handle::simulation(rt.handle()); + let flag = Arc::new(AtomicBool::new(false)); + let flag_clone = flag.clone(); + let handle_for_spawn = handle.clone(); + + rt.block_on(async move { + let jh = handle.spawn(async move { + handle_for_spawn + .timeout(std::time::Duration::from_millis(100), async {}) + .await + .ok(); + flag_clone.store(true, Ordering::Release); + }); + jh.abort_handle().abort(); + + let result = jh.await; + let _ = handle.timeout(std::time::Duration::from_millis(500), async {}).await; + assert!(result.is_err()); + assert!(!flag.load(Ordering::Acquire)); + }); + } +} diff --git a/crates/runtime/src/sim/buggify.rs b/crates/runtime/src/sim/buggify.rs new file mode 100644 index 00000000000..07188c6c207 --- /dev/null +++ b/crates/runtime/src/sim/buggify.rs @@ -0,0 +1,51 @@ +use crate::sim::Runtime; + +/// Probabilistic fault-injection helpers for simulation code. +/// +/// Reference: . +/// +/// Buggify is tied to a specific simulation runtime. Callers toggle it on that +/// runtime, then ask whether a fault should be injected at a particular point. +pub fn enable(runtime: &Runtime) { + runtime.enable_buggify(); +} + +/// Disable probabilistic fault injection for the given simulation runtime. +pub fn disable(runtime: &Runtime) { + runtime.disable_buggify(); +} + +/// Returns whether buggify is enabled for the given simulation runtime. 
+pub fn is_enabled(runtime: &Runtime) -> bool { + runtime.is_buggify_enabled() +} + +/// Returns whether the runtime should inject a fault at this point using the +/// default deterministic probability. +pub fn should_inject_fault(runtime: &Runtime) -> bool { + runtime.buggify() +} + +/// Returns whether the runtime should inject a fault at this point using the +/// provided deterministic probability. +pub fn should_inject_fault_with_prob(runtime: &Runtime, probability: f64) -> bool { + runtime.buggify_with_prob(probability) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn runtime_owned_buggify_controls_fault_injection() { + let runtime = Runtime::new(7); + + assert!(!is_enabled(&runtime)); + enable(&runtime); + assert!(is_enabled(&runtime)); + assert!(should_inject_fault_with_prob(&runtime, 1.0)); + disable(&runtime); + assert!(!is_enabled(&runtime)); + assert!(!should_inject_fault_with_prob(&runtime, 1.0)); + } +} diff --git a/crates/runtime/src/sim/executor/mod.rs b/crates/runtime/src/sim/executor/mod.rs new file mode 100644 index 00000000000..0b874be8afe --- /dev/null +++ b/crates/runtime/src/sim/executor/mod.rs @@ -0,0 +1,793 @@ +use alloc::{collections::BTreeMap, sync::Arc, vec::Vec}; +use core::{ + fmt, + future::Future, + pin::Pin, + sync::atomic::{AtomicBool, AtomicU64, Ordering}, + task::{Context, Poll, Waker}, + time::Duration, +}; + +use spin::Mutex; + +use crate::sim::{time::TimeHandle, Rng}; + +mod task; +pub use task::{AbortHandle, JoinError, JoinHandle}; +use task::Abortable; + +type Runnable = async_task::Runnable; + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub struct RuntimeConfig { + pub seed: u64, +} + +impl RuntimeConfig { + pub const fn new(seed: u64) -> Self { + Self { seed } + } +} + +impl Default for RuntimeConfig { + fn default() -> Self { + Self::new(0) + } +} + +/// A unique identifier for a simulated node. 
+#[derive(Clone, Copy, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +pub struct NodeId(u64); + +impl NodeId { + /// The default node for single-node simulation or top-level runtime work. + pub const MAIN: Self = Self(0); +} + +impl fmt::Display for NodeId { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + self.0.fmt(f) + } +} + +/// Immutable metadata attached to one simulated node. +#[derive(Clone, Debug, Default, Eq, PartialEq)] +struct NodeConfig { + name: Option, +} + +/// Builder for configuring a simulated node before it is created. +pub struct NodeBuilder { + handle: Handle, + config: NodeConfig, +} + +impl NodeBuilder { + /// Assign a human-readable name to the node. + pub fn name(mut self, name: impl Into) -> Self { + self.config.name = Some(name.into()); + self + } + + /// Create the node with the accumulated configuration. + pub fn build(self) -> Node { + self.handle.build_node(self.config) + } +} + +/// Handle to one simulated node in the runtime. +#[derive(Clone)] +pub struct Node { + id: NodeId, + handle: Handle, + config: Arc, +} + +impl Node { + /// Return the stable identifier for this simulated node. + pub fn id(&self) -> NodeId { + self.id + } + + /// Return the optional human-readable name for this node. + pub fn name(&self) -> Option<&str> { + self.config.name.as_deref() + } + + /// Pause scheduling for this node. + pub fn pause(&self) { + self.handle.pause(self.id); + } + + /// Resume scheduling for this node. + pub fn resume(&self) { + self.handle.resume(self.id); + } + + /// Spawn a `Send` future onto this simulated node. + pub fn spawn(&self, future: F) -> JoinHandle + where + F: Future + Send + 'static, + F::Output: Send + 'static, + { + self.handle.spawn_on(self.id, future) + } + + /// Spawn a non-`Send` future onto this simulated node. 
+ pub fn spawn_local(&self, future: F) -> JoinHandle + where + F: Future + 'static, + F::Output: 'static, + { + self.handle.spawn_local_on(self.id, future) + } +} + +/// A small single-threaded runtime for DST's top-level future. +/// +/// futures are scheduled as runnables, the ready queue +/// is sampled by deterministic RNG, and pending execution without future events +/// is considered a test hang. +pub struct Runtime { + executor: Arc, +} + +impl Runtime { + /// Create a simulation runtime seeded for deterministic scheduling and RNG. + pub fn new(seed: u64) -> Self { + Self::with_config(RuntimeConfig::new(seed)) + } + + /// Create a simulation runtime from an explicit runtime configuration. + pub fn with_config(config: RuntimeConfig) -> Self { + Self { + executor: Arc::new(Executor::new(config)), + } + } + + /// Drive a top-level future to completion on the simulation executor. + /// + /// While the future runs, spawned tasks share the same deterministic + /// scheduler, timer wheel, and runtime RNG. + pub fn block_on(&mut self, future: F) -> F::Output { + self.executor.block_on(future) + } + + /// Return the amount of virtual time elapsed in this runtime. + pub fn elapsed(&self) -> Duration { + self.executor.elapsed() + } + + /// Get a cloneable handle for spawning tasks and accessing runtime services. + pub fn handle(&self) -> Handle { + Handle { + executor: Arc::clone(&self.executor), + } + } + + /// Create a new simulated node. + /// + /// Nodes are a scheduling/pausing boundary rather than separate executors: + /// all nodes still run on the same single-threaded runtime. + pub fn create_node(&self) -> NodeBuilder { + self.handle().create_node() + } + + /// Pause scheduling for a node. + /// + /// Tasks already queued for the node are retained and will run only after + /// the node is resumed. + pub fn pause(&self, node: NodeId) { + self.handle().pause(node); + } + + /// Resume scheduling for a previously paused node. 
+ pub fn resume(&self, node: NodeId) { + self.handle().resume(node); + } + + /// Spawn a `Send` future onto a specific simulated node. + pub fn spawn_on(&self, node: NodeId, future: F) -> JoinHandle + where + F: Future + Send + 'static, + F::Output: Send + 'static, + { + self.handle().spawn_on(node, future) + } + + pub fn enable_buggify(&self) { + self.executor.enable_buggify(); + } + + /// Disable probabilistic fault injection for this runtime. + pub fn disable_buggify(&self) { + self.executor.disable_buggify(); + } + + /// Return whether buggify is enabled for this runtime. + pub fn is_buggify_enabled(&self) -> bool { + self.executor.is_buggify_enabled() + } + + /// Sample the default runtime buggify probability. + pub fn buggify(&self) -> bool { + self.executor.buggify() + } + + /// Sample a caller-provided runtime buggify probability. + pub fn buggify_with_prob(&self, probability: f64) -> bool { + self.executor.buggify_with_prob(probability) + } + + #[allow(dead_code)] + pub(crate) fn enable_determinism_log(&self) { + self.executor.rng.enable_determinism_log(); + } + + #[allow(dead_code)] + pub(crate) fn enable_determinism_check(&self, log: crate::sim::DeterminismLog) { + self.executor.rng.enable_determinism_check(log); + } + + #[allow(dead_code)] + pub(crate) fn take_determinism_log(&self) -> Option { + self.executor.rng.take_determinism_log() + } + + #[allow(dead_code)] + pub(crate) fn finish_determinism_check(&self) -> Result<(), alloc::string::String> { + self.executor.rng.finish_determinism_check() + } +} + +/// Cloneable access to the simulation executor. +#[derive(Clone)] +pub struct Handle { + executor: Arc, +} + +impl Handle { + /// Create a new simulated node owned by this runtime. 
+ pub fn create_node(&self) -> NodeBuilder { + NodeBuilder { + handle: self.clone(), + config: NodeConfig::default(), + } + } + + fn build_node(&self, config: NodeConfig) -> Node { + let id = self.executor.create_node(config.clone()); + let config = self.executor.node_config(id); + Node { + id, + handle: self.clone(), + config, + } + } + + /// Pause scheduling for a node. + pub fn pause(&self, node: NodeId) { + self.executor.pause(node); + } + + /// Resume scheduling for a node and requeue any buffered tasks for it. + pub fn resume(&self, node: NodeId) { + self.executor.resume(node); + } + + /// Spawn a `Send` future onto a specific simulated node. + pub fn spawn_on(&self, node: NodeId, future: F) -> JoinHandle + where + F: Future + Send + 'static, + F::Output: Send + 'static, + { + self.executor.spawn_on(node, future) + } + + /// Spawn a non-`Send` future onto a specific simulated node. + /// + /// This is only valid because the simulation executor is single-threaded. + pub fn spawn_local_on(&self, node: NodeId, future: F) -> JoinHandle + where + F: Future + 'static, + F::Output: 'static, + { + self.executor.spawn_local_on(node, future) + } + + /// Return the current virtual time for this runtime. + pub fn now(&self) -> Duration { + self.executor.time.now() + } + + /// Move virtual time forward explicitly. + pub fn advance(&self, duration: Duration) { + self.executor.time.advance(duration); + } + + /// Create a future that becomes ready after `duration` of virtual time. + pub fn sleep(&self, duration: Duration) -> crate::sim::time::Sleep { + self.executor.time.sleep(duration) + } + + /// Race a future against a virtual-time timeout. + pub async fn timeout( + &self, + duration: Duration, + future: impl Future, + ) -> Result { + self.executor.time.timeout(duration, future).await + } + + pub fn enable_buggify(&self) { + self.executor.enable_buggify(); + } + + /// Disable probabilistic fault injection for this runtime. 
+ pub fn disable_buggify(&self) { + self.executor.disable_buggify(); + } + + /// Return whether buggify is enabled for this runtime. + pub fn is_buggify_enabled(&self) -> bool { + self.executor.is_buggify_enabled() + } + + /// Sample the default runtime buggify probability. + pub fn buggify(&self) -> bool { + self.executor.buggify() + } + + /// Sample a caller-provided runtime buggify probability. + pub fn buggify_with_prob(&self, probability: f64) -> bool { + self.executor.buggify_with_prob(probability) + } +} + +/// Core single-threaded scheduler backing a simulation [`Runtime`]. +/// +/// The executor owns the runnable queue, per-node pause state, deterministic +/// RNG, and virtual time. Tasks are selected from the queue using the runtime +/// RNG so the schedule is reproducible for a given seed. +struct Executor { + queue: Receiver, + sender: Sender, + nodes: spin::Mutex>>, + next_node: AtomicU64, + rng: Rng, + time: TimeHandle, +} + +impl Executor { + /// Construct a fresh executor with one default `MAIN` node. 
+ fn new(config: RuntimeConfig) -> Self { + let queue = Queue::new(); + let mut nodes = BTreeMap::new(); + nodes.insert(NodeId::MAIN, Arc::new(NodeRecord::default())); + Self { + queue: queue.receiver(), + sender: queue.sender(), + nodes: spin::Mutex::new(nodes), + next_node: AtomicU64::new(1), + rng: Rng::new(config.seed), + time: TimeHandle::new(), + } + } + + fn elapsed(&self) -> Duration { + self.time.now() + } + + fn enable_buggify(&self) { + self.rng.enable_buggify(); + } + + fn disable_buggify(&self) { + self.rng.disable_buggify(); + } + + fn is_buggify_enabled(&self) -> bool { + self.rng.is_buggify_enabled() + } + + fn buggify(&self) -> bool { + self.rng.buggify() + } + + fn buggify_with_prob(&self, probability: f64) -> bool { + self.rng.buggify_with_prob(probability) + } + + fn create_node(&self, config: NodeConfig) -> NodeId { + let id = NodeId(self.next_node.fetch_add(1, Ordering::Relaxed)); + self.nodes.lock().insert( + id, + Arc::new(NodeRecord { + config: Arc::new(config), + state: NodeState::default(), + }), + ); + id + } + + fn node_config(&self, node: NodeId) -> Arc { + self.node_record(node).config.clone() + } + + /// Mark a node as paused so newly selected runnables are buffered. + fn pause(&self, node: NodeId) { + self.node_record(node).state.paused.store(true, Ordering::Relaxed); + } + + /// Mark a node as runnable again and requeue any buffered tasks for it. + fn resume(&self, node: NodeId) { + let record = self.node_record(node); + record.state.paused.store(false, Ordering::Relaxed); + + let mut paused = record.state.paused_queue.lock(); + for runnable in paused.drain(..) { + self.sender.send(runnable); + } + } + + /// Spawn a `Send` task and enqueue its runnable on the shared runtime queue. 
+ fn spawn_on(&self, node: NodeId, future: F) -> JoinHandle + where + F: Future + Send + 'static, + F::Output: Send + 'static, + { + self.assert_known_node(node); + + let abort = AbortHandle::new(); + let abortable = Abortable::new(future, abort.clone()); + let sender = self.sender.clone(); + let (runnable, task) = async_task::Builder::new() + .metadata(node) + .spawn(move |_| abortable, move |runnable| sender.send(runnable)); + runnable.schedule(); + + JoinHandle { task, abort } + } + + /// Spawn a non-`Send` task on the single-threaded runtime. + fn spawn_local_on(&self, node: NodeId, future: F) -> JoinHandle + where + F: Future + 'static, + F::Output: 'static, + { + self.assert_known_node(node); + + let abort = AbortHandle::new(); + let abortable = Abortable::new(future, abort.clone()); + let sender = self.sender.clone(); + let (runnable, task) = unsafe { + async_task::Builder::new() + .metadata(node) + .spawn_unchecked(move |_| abortable, move |runnable| sender.send(runnable)) + }; + runnable.schedule(); + + JoinHandle { task, abort } + } + + #[track_caller] + /// Run the top-level future until completion. + /// + /// The executor repeatedly drains runnable tasks, then advances virtual + /// time to the next timer when the queue is empty. If neither runnable work + /// nor timers remain, the simulation is considered deadlocked. 
+ fn block_on(&self, future: F) -> F::Output { + let sender = self.sender.clone(); + let (runnable, mut task) = unsafe { + async_task::Builder::new() + .metadata(NodeId::MAIN) + .spawn_unchecked(move |_| future, move |runnable| sender.send(runnable)) + }; + runnable.schedule(); + + loop { + self.run_all_ready(); + if task.is_finished() { + let waker = Waker::noop(); + return match Pin::new(&mut task).poll(&mut Context::from_waker(&waker)) { + Poll::Ready(output) => output, + Poll::Pending => unreachable!("task.is_finished() was true"), + }; + } + + if self.time.wake_next_timer() { + continue; + } + + panic!("no runnable tasks; all simulated tasks are blocked"); + } + } + + /// Drain the runnable queue, selecting tasks in deterministic RNG order. + /// + /// Paused-node tasks are diverted into that node's paused buffer instead of + /// being polled immediately. + fn run_all_ready(&self) { + while let Some(runnable) = self.queue.try_recv_random(&self.rng) { + let node = *runnable.metadata(); + let record = self.node_record(node); + if record.state.paused.load(Ordering::Relaxed) { + record.state.paused_queue.lock().push(runnable); + continue; + } + // TODO: Do some time advance here too + runnable.run(); + } + } + + /// Look up the record for a node, panicking if the node is unknown. + fn node_record(&self, node: NodeId) -> Arc { + self.nodes + .lock() + .get(&node) + .cloned() + .unwrap_or_else(|| panic!("unknown simulated node {node}")) + } + + fn assert_known_node(&self, node: NodeId) { + let _ = self.node_record(node); + } +} + +/// One simulated node's immutable metadata plus scheduler state. +#[derive(Clone, Default)] +struct NodeRecord { + config: Arc, + state: NodeState, +} + +/// Per-node scheduler state shared by tasks assigned to that node. +#[derive(Clone, Default)] +struct NodeState { + paused: Arc, + paused_queue: Arc>>, +} + +/// Yield back to the scheduler once. 
+/// +/// This is the smallest explicit interleaving point available to simulated +/// tasks when they need to give other runnables a chance to execute. +pub async fn yield_now() { + YieldNow { yielded: false }.await +} + +/// One-shot future backing [`yield_now`]. +struct YieldNow { + yielded: bool, +} + +impl Future for YieldNow { + type Output = (); + + fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + if self.yielded { + Poll::Ready(()) + } else { + self.yielded = true; + cx.waker().wake_by_ref(); + Poll::Pending + } + } +} + +/// Shared runnable queue used by the simulation executor. +/// TODO: Make it generic over T +struct Queue { + inner: Arc, +} + +/// Sending end of the runnable queue. +#[derive(Clone)] +struct Sender { + inner: Arc, +} + +/// Receiving end of the runnable queue. +#[derive(Clone)] +struct Receiver { + inner: Arc, +} + +/// Queue storage for runnables awaiting scheduling. +struct QueueInner { + queue: Mutex>, +} + +impl Queue { + fn new() -> Self { + Self { + inner: Arc::new(QueueInner { + queue: Mutex::new(Vec::new()), + }), + } + } + + fn sender(&self) -> Sender { + Sender { + inner: self.inner.clone(), + } + } + + fn receiver(&self) -> Receiver { + Receiver { + inner: self.inner.clone(), + } + } +} + +impl Sender { + /// Push a runnable onto the shared queue. + fn send(&self, runnable: Runnable) { + self.inner.queue.lock().push(runnable); + } +} + +impl Receiver { + /// Remove one runnable using the runtime RNG to choose among ready tasks. 
+ fn try_recv_random(&self, rng: &Rng) -> Option { + let mut queue = self.inner.queue.lock(); + if queue.is_empty() { + return None; + } + let idx = rng.index(queue.len()); + Some(queue.swap_remove(idx)) + } +} + +#[cfg(test)] +mod tests { + use std::sync::{ + atomic::{AtomicBool, AtomicUsize, Ordering}, + Arc, + }; + + use super::*; + use crate::sim::RuntimeConfig; + + #[test] + fn paused_node_does_not_run_until_resumed() { + let mut runtime = Runtime::new(1); + let node = runtime.create_node().name("paused").build(); + node.pause(); + + let runs = Arc::new(AtomicUsize::new(0)); + let task_runs = Arc::clone(&runs); + let task = node.spawn(async move { + task_runs.fetch_add(1, Ordering::SeqCst); + 7 + }); + + runtime.block_on(async { + yield_now().await; + }); + assert_eq!(runs.load(Ordering::SeqCst), 0); + + node.resume(); + assert_eq!(runtime.block_on(task).expect("paused task should complete"), 7); + assert_eq!(runs.load(Ordering::SeqCst), 1); + } + + #[test] + fn handle_can_spawn_onto_node_from_simulated_task() { + let mut runtime = Runtime::new(2); + let handle = runtime.handle(); + + let value = runtime.block_on(async move { + let node = handle.create_node().name("spawned").build(); + node.spawn(async { 11 }).await.expect("spawned task should complete") + }); + + assert_eq!(value, 11); + } + + #[test] + fn runtime_config_sets_seed() { + let runtime = Runtime::with_config(RuntimeConfig::new(77)); + let handle = runtime.handle(); + handle.enable_buggify(); + + let actual = (0..8).map(|_| handle.buggify_with_prob(0.5)).collect::>(); + + let expected = { + let rng = Rng::new(77); + rng.enable_buggify(); + (0..8).map(|_| rng.buggify_with_prob(0.5)).collect::>() + }; + + assert_eq!(actual, expected); + } + + #[test] + fn runtime_and_handle_share_buggify_state() { + let runtime = Runtime::new(6); + let handle = runtime.handle(); + + assert!(!runtime.is_buggify_enabled()); + runtime.enable_buggify(); + assert!(handle.is_buggify_enabled()); + 
assert!(handle.buggify_with_prob(1.0)); + handle.disable_buggify(); + assert!(!runtime.is_buggify_enabled()); + } + + #[test] + fn aborted_task_returns_join_error_when_awaited() { + let mut runtime = Runtime::new(8); + let node = runtime.create_node().name("abort").build(); + let task = node.spawn(async move { + yield_now().await; + 99 + }); + task.abort_handle().abort(); + + let err = runtime + .block_on(task) + .expect_err("aborted task should surface JoinError instead of panicking"); + assert_eq!(err, JoinError); + } + + #[cfg(feature = "simulation")] + #[test] + fn sim_std_block_on_can_spawn_local_task_with_explicit_handle() { + let mut runtime = Runtime::new(5); + let handle = runtime.handle(); + let node = handle.create_node().name("local").build(); + let value = crate::sim_std::block_on(&mut runtime, async move { + let captured = std::rc::Rc::new(17); + node.spawn_local(async move { + yield_now().await; + *captured + }) + .await + .expect("spawned local task should complete") + }); + + assert_eq!(value, 17); + } + + #[test] + fn node_builder_sets_name() { + let runtime = Runtime::new(9); + let unnamed = runtime.create_node().build(); + let named = runtime.create_node().name("replica-1").build(); + + assert_eq!(unnamed.name(), None); + assert_eq!(named.name(), Some("replica-1")); + assert_ne!(unnamed.id(), named.id()); + } + + #[cfg(feature = "simulation")] + #[test] + fn check_determinism_runs_future_twice() { + static CALLS: AtomicUsize = AtomicUsize::new(0); + CALLS.store(0, Ordering::SeqCst); + + let value = crate::sim_std::check_determinism(3, || async { + CALLS.fetch_add(1, Ordering::SeqCst); + yield_now().await; + 13 + }); + + assert_eq!(value, 13); + assert_eq!(CALLS.load(Ordering::SeqCst), 2); + } + + #[cfg(feature = "simulation")] + #[test] + #[should_panic(expected = "non-determinism detected")] + fn check_determinism_rejects_different_scheduler_sequence() { + static FIRST_RUN: AtomicBool = AtomicBool::new(true); + FIRST_RUN.store(true, 
Ordering::SeqCst); + + crate::sim_std::check_determinism(4, || async { + if FIRST_RUN.swap(false, Ordering::SeqCst) { + yield_now().await; + } + }); + } +} diff --git a/crates/runtime/src/sim/executor/task.rs b/crates/runtime/src/sim/executor/task.rs new file mode 100644 index 00000000000..d98ad3d8348 --- /dev/null +++ b/crates/runtime/src/sim/executor/task.rs @@ -0,0 +1,162 @@ +use alloc::sync::Arc; +use core::{ + fmt, + future::Future, + pin::Pin, + sync::atomic::{AtomicBool, Ordering}, + task::{Context, Poll, Waker}, +}; + +use spin::Mutex; + +use super::NodeId; + +/// A spawned simulated task. +/// +/// Two handles reference the same underlying allocation: +/// - `JoinHandle` awaits the output and holds an `AbortHandle` for cancellation. +/// - The executor holds the `Runnable` (not visible here). +pub struct JoinHandle { + // async_task::Task owns a shared heap-allocated cell that holds the future, + // its output, metadata (NodeId), and waker. Polling it drives the future + // to completion. Dropping it without detach cancels the future. + pub(crate) task: async_task::Task, NodeId>, + // Clone of the same AbortHandle that Abortable holds inside the task. + pub(crate) abort: AbortHandle, +} + +impl JoinHandle { + /// Return a handle that can cancel this task. + pub fn abort_handle(&self) -> AbortHandle { + self.abort.clone() + } + + /// Drop the join handle without cancelling the task. + pub fn detach(self) { + // async_task::Task::detach makes Drop a no-op — the future keeps running. + self.task.detach(); + } + + /// Poll the underlying async_task::Task for its output. + pub(crate) fn poll_join( + mut self: Pin<&mut Self>, + cx: &mut Context<'_>, + ) -> Poll> { + // async_task::Task implements Future. Polling it drives the wrapped + // Abortable future inside the executor. 
+        Pin::new(&mut self.task).poll(cx)
+    }
+}
+
+impl Future for JoinHandle {
+    type Output = Result;
+
+    fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll {
+        self.as_mut().poll_join(cx)
+    }
+}
+
+/// Two-phase cancellation for a simulated task.
+///
+/// [`AbortHandle`] and [`Abortable`] work together:
+/// - `abort()` sets an atomic flag and wakes the task so it gets polled.
+/// - On the next poll, `Abortable` checks the flag and returns `Err(JoinError)`.
+/// - `JoinHandle::poll` reads that error and surfaces it to the awaiting code.
+/// - The task's future is dropped naturally when `Abortable` returns `Err`.
+///
+/// `abort()` is thread-safe — it can be called from any task or node, and the
+/// waker ensures the target task is re-scheduled even if it was blocked on I/O
+/// or a timer.
+#[derive(Clone)]
+pub struct AbortHandle {
+    state: Arc,
+}
+
+impl AbortHandle {
+    pub(crate) fn new() -> Self {
+        Self {
+            state: Arc::new(AbortState::new()),
+        }
+    }
+
+    pub fn abort(&self) {
+        // Step 1: atomically mark the task as aborted.
+        self.state.aborted.store(true, Ordering::Relaxed);
+        // Step 2: wake the task so the executor re-schedules it for polling.
+        // Waking does not cancel a timer; a pending `Sleep` removes its entry when dropped.
+        if let Some(waker) = self.state.waker.lock().take() {
+            waker.wake();
+        }
+    }
+}
+
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+pub struct JoinError;
+
+impl fmt::Display for JoinError {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.write_str("task was cancelled")
+    }
+}
+
+#[cfg(feature = "simulation")]
+impl std::error::Error for JoinError {}
+
+// Shared state between AbortHandle and Abortable.
+struct AbortState {
+    // Set to true by AbortHandle::abort(), read by Abortable::poll().
+    aborted: AtomicBool,
+    // The executor's waker, registered by Abortable on every poll.
+    // Stored so abort() can wake the task even if it's waiting on I/O.
+    waker: Mutex>,
+}
+
+impl AbortState {
+    fn new() -> Self {
+        Self {
+            aborted: AtomicBool::new(false),
+            waker: Mutex::new(None),
+        }
+    }
+}
+
+/// Wraps a future so it can be cancelled via an [`AbortHandle`].
+///
+/// The executor wraps every spawned future in `Abortable`. On each poll it
+/// checks the cancellation flag before progressing the inner future.
+pub(crate) struct Abortable {
+    future: F,
+    abort: AbortHandle,
+}
+
+impl Abortable {
+    pub(crate) fn new(future: F, abort: AbortHandle) -> Self {
+        Self { future, abort }
+    }
+}
+
+impl Future for Abortable {
+    type Output = Result;
+
+    fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll {
+        // Register the waker first so an abort() racing with this poll still wakes the task.
+        self.abort.state.waker.lock().replace(cx.waker().clone());
+
+        // Then check cancellation before progressing the inner future.
+        if self.abort.state.aborted.load(Ordering::Relaxed) {
+            return Poll::Ready(Err(JoinError));
+        }
+
+        // SAFETY: `future` is structurally pinned inside `Abortable`, so projecting
+        // `Pin<&mut Self>` to a pinned reference to `future` with
+        // `map_unchecked_mut` is sound:
+        // 1. `future` is a direct field of `Abortable` — no indirection.
+        // 2. Nothing in this module moves `future` out of a pinned `Abortable`;
+        //    it is only ever reached through this projection.
+        // 3. `Abortable` has no `Drop` impl and is not `#[repr(packed)]`.
+        // 4. The caller guarantees `self` stays pinned for the lifetime of the
+        //    future.
+ let mut future = unsafe { self.map_unchecked_mut(|this| &mut this.future) }; + future.as_mut().poll(cx).map(Ok) + } +} diff --git a/crates/runtime/src/sim/mod.rs b/crates/runtime/src/sim/mod.rs new file mode 100644 index 00000000000..ccdcc104991 --- /dev/null +++ b/crates/runtime/src/sim/mod.rs @@ -0,0 +1,10 @@ +pub mod buggify; +mod executor; +mod rng; +pub mod time; + +pub use executor::{ + yield_now, AbortHandle, Handle, JoinError, JoinHandle, Node, NodeBuilder, NodeId, Runtime, RuntimeConfig, +}; +pub(crate) use rng::DeterminismLog; +pub use rng::{GlobalRng, Rng}; diff --git a/crates/runtime/src/sim/rng.rs b/crates/runtime/src/sim/rng.rs new file mode 100644 index 00000000000..b39219290dd --- /dev/null +++ b/crates/runtime/src/sim/rng.rs @@ -0,0 +1,202 @@ +use alloc::{format, string::String}; +use alloc::{sync::Arc, vec::Vec}; +use spin::Mutex; + +pub type Rng = GlobalRng; + +/// Shared deterministic RNG for the simulation core. +/// +/// The simulator owns one runtime-wide RNG handle and uses it for scheduler +/// choices, probabilistic fault injection, and determinism checks. Hosted +/// conveniences such as thread-local current-RNG access and libc random hooks +/// live in `crate::sim_std`, not here. +#[derive(Clone, Debug)] +pub struct GlobalRng { + inner: Arc>, +} + +#[derive(Debug)] +struct Inner { + /// Seed used to initialize the runtime RNG, carried for diagnostics and replay. + seed: u64, + /// Deterministic generator used for scheduler choices and fault injection decisions. + rng: SplitMix64, + /// Checkpoints recorded during the first determinism run. + log: Option>, + /// Expected checkpoints plus the number already consumed during replay. + check: Option<(Vec, usize)>, + /// Whether probabilistic fault injection is currently enabled for this runtime. 
+    buggify_enabled: bool,
+}
+
+const GAMMA: u64 = 0x9e37_79b9_7f4a_7c15;
+
+/// Reference for SplitMix64 algorithm: https://rosettacode.org/wiki/Pseudo-random_numbers/Splitmix64
+/// SplitMix64 is the pseudo-random number generator behind the shared simulation RNG.
+/// It uses a fairly simple algorithm that, though it is considered
+/// to be poor for cryptographic purposes, is very fast to calculate,
+/// and is "good enough" for many random number needs.
+/// It passes several fairly rigorous PRNG "fitness" tests that some more complex algorithms fail.
+#[derive(Clone, Debug)]
+struct SplitMix64 {
+    state: u64,
+}
+
+impl SplitMix64 {
+    fn new(seed: u64) -> Self {
+        Self { state: seed }
+    }
+
+    fn next_u64(&mut self) -> u64 {
+        self.state = self.state.wrapping_add(GAMMA);
+        mix64(self.state)
+    }
+
+    fn fill_bytes(&mut self, dest: &mut [u8]) {
+        for chunk in dest.chunks_mut(core::mem::size_of::()) {
+            let bytes = self.next_u64().to_le_bytes(); // fixed byte order on every host
+            chunk.copy_from_slice(&bytes[..chunk.len()]);
+        }
+    }
+}
+
+fn mix64(mut x: u64) -> u64 {
+    x = (x ^ (x >> 30)).wrapping_mul(0xbf58_476d_1ce4_e5b9);
+    x = (x ^ (x >> 27)).wrapping_mul(0x94d0_49bb_1331_11eb);
+    x ^ (x >> 31)
+}
+
+impl GlobalRng {
+    /// Create a new deterministic RNG for a simulation runtime.
+    pub fn new(seed: u64) -> Self {
+        Self {
+            inner: Arc::new(Mutex::new(Inner {
+                seed,
+                rng: SplitMix64::new(seed),
+                log: None,
+                check: None,
+                buggify_enabled: false,
+            })),
+        }
+    }
+
+    pub fn next_u64(&self) -> u64 {
+        self.with_inner(|inner| inner.rng.next_u64())
+    }
+
+    pub fn index(&self, len: usize) -> usize {
+        assert!(len > 0, "len must be non-zero");
+        (self.next_u64() % len as u64) as usize // reduce in u64 so 32-bit hosts pick the same index
+    }
+
+    pub fn sample_probability(&self, probability: f64) -> bool {
+        probability_sample(self.next_u64(), probability)
+    }
+
+    pub fn enable_buggify(&self) {
+        self.inner.lock().buggify_enabled = true;
+    }
+
+    pub fn disable_buggify(&self) {
+        self.inner.lock().buggify_enabled = false;
+    }
+
+    pub fn is_buggify_enabled(&self) -> bool {
+        self.inner.lock().buggify_enabled
+    }
+
+    pub fn buggify(&self) -> bool {
+        self.buggify_with_prob(0.25)
+    }
+
+    pub fn buggify_with_prob(&self, probability: f64) -> bool {
+        self.is_buggify_enabled() && self.sample_probability(probability)
+    }
+
+    #[allow(dead_code)]
+    pub(crate) fn seed(&self) -> u64 {
+        self.inner.lock().seed
+    }
+
+    fn with_inner(&self, f: impl FnOnce(&mut Inner) -> T) -> T {
+        let mut inner = self.inner.lock();
+        let output = f(&mut inner);
+        if inner.log.is_some() || inner.check.is_some() {
+            let checkpoint = checksum(inner.rng.clone().next_u64());
+            if let Some(log) = &mut inner.log {
+                log.push(checkpoint);
+            }
+            let seed = inner.seed;
+            if let Some((expected, consumed)) = &mut inner.check {
+                if expected.get(*consumed) != Some(&checkpoint) {
+                    panic!("non-determinism detected for seed {} at checkpoint {consumed}", seed);
+                }
+                *consumed += 1;
+            }
+        }
+        output
+    }
+
+    #[allow(dead_code)]
+    pub(crate) fn fill_bytes(&self, dest: &mut [u8]) {
+        self.with_inner(|inner| inner.rng.fill_bytes(dest));
+    }
+
+    #[allow(dead_code)]
+    pub(crate) fn enable_determinism_log(&self) {
+        let mut inner = self.inner.lock();
+        inner.log = Some(Vec::new());
+        inner.check = None;
+    }
+
+    #[allow(dead_code)]
+    pub(crate) fn
enable_determinism_check(&self, log: DeterminismLog) { + let mut inner = self.inner.lock(); + inner.check = Some((log.0, 0)); + inner.log = None; + } + + #[allow(dead_code)] + pub(crate) fn take_determinism_log(&self) -> Option { + let mut inner = self.inner.lock(); + inner + .log + .take() + .or_else(|| inner.check.take().map(|(log, _)| log)) + .map(DeterminismLog) + } + + #[allow(dead_code)] + pub(crate) fn finish_determinism_check(&self) -> Result<(), String> { + let inner = self.inner.lock(); + if let Some((log, consumed)) = &inner.check { + if *consumed != log.len() { + return Err(format!( + "non-determinism detected for seed {}: consumed {consumed} of {} checkpoints", + inner.seed, + log.len() + )); + } + } + Ok(()) + } +} + +#[derive(Debug, Clone, Eq, PartialEq)] +pub(crate) struct DeterminismLog(Vec); + +fn probability_sample(value: u64, probability: f64) -> bool { + if probability <= 0.0 { + return false; + } + if probability >= 1.0 { + return true; + } + + let unit = (value >> 11) as f64 * (1.0 / ((1u64 << 53) as f64)); + unit < probability +} + +fn checksum(value: u64) -> u8 { + value.to_ne_bytes().into_iter().fold(0, |acc, byte| acc ^ byte) +} diff --git a/crates/runtime/src/sim/time/mod.rs b/crates/runtime/src/sim/time/mod.rs new file mode 100644 index 00000000000..f8bf3571cf2 --- /dev/null +++ b/crates/runtime/src/sim/time/mod.rs @@ -0,0 +1,302 @@ +mod sleep; + +use alloc::{collections::BTreeMap, sync::Arc, vec::Vec}; +use core::{fmt, future::Future, pin::pin, task::{Poll, Waker}, time::Duration}; +use sleep::wake_all; +use spin::Mutex; + +pub use sleep::Sleep; + +/// Shared virtual clock and timer registry for one simulation runtime. +/// +/// Virtual clock that only advances when explicitly driven — no wall-clock +/// progression, like Tokio's time-pause mode. +/// +/// All cloned handles observe the same virtual `now`, pending timers, and +/// timer-id sequence. 
The executor uses this handle both for explicit +/// time-travel operations and for jumping directly to the next pending timer +/// when the runnable queue is empty. +#[derive(Clone, Debug)] +pub struct TimeHandle { + inner: Arc>, +} + +impl TimeHandle { + pub fn new() -> Self { + Self { + inner: Arc::new(Mutex::new(TimeState::default())), + } + } + + pub fn now(&self) -> Duration { + self.inner.lock().now + } + + /// Move virtual time forward by an explicit amount. + /// + /// This is the direct "advance the clock" operation used by tests and + /// higher-level simulation code. It updates `now`, removes any timers that + /// became due at the new instant, and wakes the corresponding tasks after + /// releasing the lock. + pub fn advance(&self, duration: Duration) { + if duration.is_zero() { + return; + } + + let wakers = { + let mut state = self.inner.lock(); + state.now = state.now.saturating_add(duration); + state.take_due_wakers() + }; + wake_all(wakers); + } + + /// Jump virtual time to the earliest outstanding timer and wake it. + /// + /// The executor calls this when there are no runnable tasks left. Instead + /// of incrementing time in wall-clock steps, simulation time jumps + /// directly to the minimum timer deadline. Returns `false` if there are no + /// timers to wake. + pub fn wake_next_timer(&self) -> bool { + let wakers = { + let mut state = self.inner.lock(); + let Some(next_deadline) = state.timers.values().map(|timer| timer.deadline).min() else { + return false; + }; + if next_deadline > state.now { + state.now = next_deadline; + } + state.take_due_wakers() + }; + let woke = !wakers.is_empty(); + wake_all(wakers); + woke + } + + /// Register or refresh a timer entry for a sleeping future. + /// + /// Sleep futures keep a stable `TimerId` across polls. Re-registering with + /// the same id updates the stored waker without creating duplicate timers. 
+ fn register_timer(&self, id: TimerId, deadline: Duration, waker: &Waker) { + let mut state = self.inner.lock(); + state.timers.insert( + id, + TimerEntry { + deadline, + waker: waker.clone(), + }, + ); + } + + /// Remove a timer entry if it is still present. + /// + /// Cancellation is best-effort because the timer may already have been + /// removed by a wakeup path before the caller reaches this point. + fn cancel_timer(&self, id: TimerId) { + self.inner.lock().timers.remove(&id); + } + + /// Allocate a fresh timer id for a new sleep future. + /// + /// Stable timer ids are what let a `Sleep` future re-register itself + /// across polls while still mapping back to a single timer entry. + fn next_timer_id(&self) -> TimerId { + let mut state = self.inner.lock(); + let id = TimerId(state.next_timer_id); + state.next_timer_id = state.next_timer_id.saturating_add(1); + id + } + + /// Create a future that becomes ready after `duration` of virtual time. + /// + /// The returned future is lazy: it does not allocate a timer entry until + /// the first poll, when it can anchor its deadline to the current virtual + /// time. + pub fn sleep(&self, duration: Duration) -> Sleep { + Sleep::new(self.clone(), duration) + } + + /// Race a future against a virtual-time sleep. + /// + /// Uses a biased `poll_fn` that polls `future` before `sleep`. If both are + /// ready in the same step, the main future wins — completion beats timeout + /// deterministically. 
+ pub async fn timeout(&self, duration: Duration, future: impl Future) -> Result { + let sleep = self.sleep(duration); + let mut future = pin!(future); + let mut sleep = pin!(sleep); + + core::future::poll_fn(|cx| { + if let Poll::Ready(output) = future.as_mut().poll(cx) { + return Poll::Ready(Ok(output)); + } + if let Poll::Ready(()) = sleep.as_mut().poll(cx) { + return Poll::Ready(Err(TimeoutElapsed { duration })); + } + Poll::Pending + }) + .await + } +} + +impl Default for TimeHandle { + fn default() -> Self { + Self::new() + } +} + +/// Mutable state behind a [`TimeHandle`]. +/// +/// `timers` is keyed by stable `TimerId` so a `Sleep` future can refresh its +/// waker across polls without accumulating duplicate entries. A `BTreeMap` is +/// used to keep due-timer iteration deterministic. +#[derive(Debug, Default)] +struct TimeState { + now: Duration, + next_timer_id: u64, + timers: BTreeMap, +} + +impl TimeState { + /// Remove every timer whose deadline is at or before the current virtual + /// time and return their wakers. + fn take_due_wakers(&mut self) -> Vec { + let due = self + .timers + .iter() + .filter_map(|(id, timer)| (timer.deadline <= self.now).then_some(*id)) + .collect::>(); + due.into_iter() + .filter_map(|id| self.timers.remove(&id).map(|timer| timer.waker)) + .collect() + } +} + +#[derive(Clone, Copy, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +struct TimerId(u64); + +/// Stored metadata for one pending timer. 
+#[derive(Debug)] +struct TimerEntry { + deadline: Duration, + waker: Waker, +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub struct TimeoutElapsed { + duration: Duration, +} + +impl TimeoutElapsed { + pub fn duration(self) -> Duration { + self.duration + } +} + +impl fmt::Display for TimeoutElapsed { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "simulated timeout elapsed after {:?}", self.duration) + } +} + +#[cfg(any(feature = "tokio", feature = "simulation"))] +impl std::error::Error for TimeoutElapsed {} + +#[cfg(test)] +mod tests { + use std::{sync::Arc, time::Duration}; + + use crate::sim; + use spin::Mutex; + + #[test] + fn sleep_fast_forwards_virtual_time() { + let mut runtime = sim::Runtime::new(101); + let handle = runtime.handle(); + + runtime.block_on(async move { + assert_eq!(handle.now(), Duration::ZERO); + handle.sleep(Duration::from_millis(5)).await; + assert_eq!(handle.now(), Duration::from_millis(5)); + }); + } + + #[test] + fn shorter_timer_wakes_first() { + let mut runtime = sim::Runtime::new(102); + let handle = runtime.handle(); + let order = Arc::new(Mutex::new(Vec::new())); + + runtime.block_on({ + let order = Arc::clone(&order); + async move { + let slow_order = Arc::clone(&order); + let slow_handle = handle.clone(); + let slow = handle.spawn_on(sim::NodeId::MAIN, async move { + slow_handle.sleep(Duration::from_millis(10)).await; + slow_order.lock().push(10); + }); + + let fast_order = Arc::clone(&order); + let fast_handle = handle.clone(); + let fast = handle.spawn_on(sim::NodeId::MAIN, async move { + fast_handle.sleep(Duration::from_millis(3)).await; + fast_order.lock().push(3); + }); + + fast.await.expect("fast timer task should complete"); + slow.await.expect("slow timer task should complete"); + } + }); + + assert_eq!(*order.lock(), vec![3, 10]); + assert_eq!(runtime.elapsed(), Duration::from_millis(10)); + } + + #[test] + fn explicit_advance_moves_virtual_time() { + let mut runtime = 
sim::Runtime::new(103); + let handle = runtime.handle(); + + runtime.block_on(async move { + handle.advance(Duration::from_millis(7)); + assert_eq!(handle.now(), Duration::from_millis(7)); + }); + } + + #[test] + fn timeout_returns_future_output_before_deadline() { + let mut runtime = sim::Runtime::new(104); + let handle = runtime.handle(); + + let output = runtime.block_on(async move { + handle + .timeout(Duration::from_millis(10), async { + handle.sleep(Duration::from_millis(3)).await; + 9 + }) + .await + }); + + assert_eq!(output, Ok(9)); + assert_eq!(runtime.elapsed(), Duration::from_millis(3)); + } + + #[test] + fn timeout_expires_at_virtual_deadline() { + let mut runtime = sim::Runtime::new(105); + let handle = runtime.handle(); + + let output = runtime.block_on(async move { + handle + .timeout(Duration::from_millis(4), async { + handle.sleep(Duration::from_millis(20)).await; + 9 + }) + .await + }); + + assert_eq!(output.unwrap_err().duration(), Duration::from_millis(4)); + assert_eq!(runtime.elapsed(), Duration::from_millis(4)); + } +} diff --git a/crates/runtime/src/sim/time/sleep.rs b/crates/runtime/src/sim/time/sleep.rs new file mode 100644 index 00000000000..53d5555ffc3 --- /dev/null +++ b/crates/runtime/src/sim/time/sleep.rs @@ -0,0 +1,108 @@ +use alloc::vec::Vec; +use core::{ + future::Future, + pin::Pin, + task::{Context, Poll, Waker}, + time::Duration, +}; + +use super::{TimeHandle, TimerId}; + +/// Future returned by [`TimeHandle::sleep`]. +/// +/// Three-state machine: +/// +/// 1. **Unregistered** — first poll. Converts the relative `duration` into an +/// absolute `deadline` using the current virtual time and registers with the +/// time handle's timer table. Transitions to `Registered`. +/// +/// 2. **Registered** — subsequent polls. If virtual time has reached the +/// deadline, the timer is cancelled and the future returns `Ready`. +/// Otherwise, the waker is refreshed in the timer entry and the future +/// returns `Pending`. +/// +/// 3. 
**Done** — any later poll returns `Ready(())` immediately.
+///
+/// On drop while `Registered`, the timer entry is cancelled to prevent stale
+/// wakers from firing after the future is abandoned.
+pub struct Sleep {
+    duration: Duration,
+    state: SleepState,
+}
+
+impl Sleep {
+    pub(super) fn new(handle: TimeHandle, duration: Duration) -> Self {
+        Self {
+            duration,
+            state: SleepState::Unregistered { handle },
+        }
+    }
+}
+
+/// Internal state machine for [`Sleep`].
+enum SleepState {
+    Unregistered {
+        handle: TimeHandle,
+    },
+    Registered {
+        handle: TimeHandle,
+        id: TimerId,
+        deadline: Duration,
+    },
+    Done,
+}
+
+impl Future for Sleep {
+    type Output = ();
+
+    fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll {
+        if matches!(self.state, SleepState::Done) {
+            return Poll::Ready(());
+        }
+
+        if let SleepState::Unregistered { handle } = &self.state {
+            let handle = handle.clone();
+            let deadline = handle.now().saturating_add(self.duration);
+            let id = handle.next_timer_id();
+            self.state = SleepState::Registered { handle, id, deadline };
+        }
+
+        let SleepState::Registered { handle, id, deadline } = &self.state else {
+            unreachable!("sleep state should be registered or done");
+        };
+
+        if handle.now() >= *deadline {
+            let handle = handle.clone();
+            let id = *id;
+            handle.cancel_timer(id);
+            self.state = SleepState::Done;
+            Poll::Ready(())
+        } else {
+            handle.register_timer(*id, *deadline, cx.waker());
+            Poll::Pending
+        }
+    }
+}
+
+impl Drop for Sleep {
+    /// Remove a pending timer entry when the future is dropped early.
+    ///
+    /// This prevents stale wakers from remaining in the runtime after the
+    /// corresponding task has been cancelled or a timeout race has completed.
+    fn drop(&mut self) {
+        if let SleepState::Registered { handle, id, .. } = &self.state {
+            handle.cancel_timer(*id);
+        }
+    }
+}
+
+/// Wake every task collected from a due-timer scan.
+/// +/// Waking happens only after the time-state mutex has been released so resumed +/// tasks can inspect or mutate timer state without deadlocking on the same +/// lock. +pub(super) fn wake_all(wakers: Vec) { + for waker in wakers { + waker.wake(); + } +} diff --git a/crates/runtime/src/sim_std.rs b/crates/runtime/src/sim_std.rs new file mode 100644 index 00000000000..08f82b9495e --- /dev/null +++ b/crates/runtime/src/sim_std.rs @@ -0,0 +1,205 @@ +//! Std-hosted entry points for running the deterministic simulator in tests. +//! +//! The portable simulator lives in [`crate::sim`]. This module is deliberately +//! host-specific: it installs thread-local context while a simulation is +//! running, checks determinism by replaying a seed in fresh OS threads, and +//! intercepts a few libc calls so std code cannot silently escape determinism. + +use alloc::boxed::Box; +use core::{cell::Cell, future::Future}; +use std::sync::OnceLock; + +use crate::sim; + +// Public entry points. + +/// Run a future to completion with std-hosted determinism guards installed. +/// +/// This wraps [`sim::Runtime::block_on`] and is the normal entry point for DST +/// tests that execute inside a hosted process. While the future runs, this +/// marks the thread as inside simulation so OS thread spawns can be rejected. +pub fn block_on(runtime: &mut sim::Runtime, future: F) -> F::Output { + let _system_thread_context = enter_simulation_thread(); + runtime.block_on(future) +} + +/// Run the same future factory twice and assert that both runs consume the same +/// deterministic RNG/scheduler trace. +/// +/// Each pass runs on a fresh OS thread so thread-local std state is not shared +/// between the recording and replay passes. 
+pub fn check_determinism(seed: u64, make_future: M) -> F::Output +where + M: Fn() -> F + Clone + Send + 'static, + F: Future + 'static, + F::Output: Send + 'static, +{ + let first = make_future.clone(); + let log = std::thread::spawn(move || { + let mut runtime = sim::Runtime::new(seed); + runtime.enable_determinism_log(); + block_on(&mut runtime, first()); + runtime + .take_determinism_log() + .expect("determinism log should be enabled") + }) + .join() + .map_err(|payload| panic_with_seed(seed, payload)) + .unwrap(); + + std::thread::spawn(move || { + let mut runtime = sim::Runtime::new(seed); + runtime.enable_determinism_check(log); + let output = block_on(&mut runtime, make_future()); + runtime.finish_determinism_check().unwrap_or_else(|err| panic!("{err}")); + output + }) + .join() + .map_err(|payload| panic_with_seed(seed, payload)) + .unwrap() +} + +fn panic_with_seed(seed: u64, payload: Box) -> ! { + eprintln!("note: run with --seed {seed} to reproduce this error"); + std::panic::resume_unwind(payload); +} + +// Simulation thread context. + +// Ambient state used only while `sim_std::block_on` is driving a simulation. +// +// The simulator itself stays explicit-handle based. This thread-local only +// marks whether the current OS thread is owned by a running simulation so +// host thread creation can be rejected. +thread_local! { + // Marks the current OS thread as simulation-owned so thread creation hooks + // can reject accidental escapes to the host scheduler. + static IN_SIMULATION: Cell = const { Cell::new(false) }; +} + +struct SimulationThreadGuard { + previous: bool, +} + +fn enter_simulation_thread() -> SimulationThreadGuard { + let previous = IN_SIMULATION.with(|state| state.replace(true)); + SimulationThreadGuard { previous } +} + +fn in_simulation() -> bool { + IN_SIMULATION.with(Cell::get) +} + +impl Drop for SimulationThreadGuard { + fn drop(&mut self) { + IN_SIMULATION.with(|state| { + state.set(self.previous); + }); + } +} + +// Thread hook. 
+ +// Hook Unix thread creation by interposing `pthread_attr_init`. +// +// `std::thread::Builder::spawn` initializes pthread attributes before creating +// the thread. Returning an error here while simulation is active makes hidden +// OS thread creation fail early, before host scheduling can affect replay. +// Outside simulation, this delegates to the real libc symbol through `RTLD_NEXT`. +#[cfg(unix)] +#[unsafe(no_mangle)] +#[inline(never)] +unsafe extern "C" fn pthread_attr_init(attr: *mut libc::pthread_attr_t) -> libc::c_int { + // std::thread enters libc through pthread_attr_init on Unix. Refusing that + // call while in simulation keeps hidden OS scheduling out of DST. + if in_simulation() { + eprintln!("attempt to spawn a system thread in simulation."); + eprintln!("note: use simulator tasks instead."); + return -1; + } + + type PthreadAttrInit = unsafe extern "C" fn(*mut libc::pthread_attr_t) -> libc::c_int; + static PTHREAD_ATTR_INIT: OnceLock = OnceLock::new(); + let original = PTHREAD_ATTR_INIT.get_or_init(|| unsafe { + // `RTLD_NEXT` skips this interposed function and finds the libc + // implementation that would have been called without the simulator. + let ptr = libc::dlsym(libc::RTLD_NEXT, c"pthread_attr_init".as_ptr().cast()); + assert!(!ptr.is_null(), "failed to resolve original pthread_attr_init"); + std::mem::transmute(ptr) + }); + unsafe { original(attr) } +} + +// Randomness syscall hooks. + +// Hook OS randomness by interposing `getrandom`. +// +// This crate no longer tries to make host randomness deterministic. Any such +// request is surfaced with a warning and then delegated to the host OS. 
+#[unsafe(no_mangle)] +#[inline(never)] +unsafe extern "C" fn getrandom(buf: *mut u8, buflen: usize, flags: u32) -> isize { + if in_simulation() { + eprintln!("warning: randomness requested; delegating to host OS"); + eprintln!("{}", std::backtrace::Backtrace::force_capture()); + } + unsafe { real_getrandom()(buf, buflen, flags) } +} + +#[cfg(target_os = "linux")] +fn real_getrandom() -> unsafe extern "C" fn(*mut u8, usize, u32) -> isize { + type GetrandomFn = unsafe extern "C" fn(*mut u8, usize, u32) -> isize; + static GETRANDOM: OnceLock = OnceLock::new(); + *GETRANDOM.get_or_init(|| unsafe { + let ptr = libc::dlsym(libc::RTLD_NEXT, c"getrandom".as_ptr().cast()); + assert!(!ptr.is_null(), "failed to resolve original getrandom"); + std::mem::transmute(ptr) + }) +} + +#[cfg(not(target_os = "linux"))] +fn real_getrandom() -> unsafe extern "C" fn(*mut u8, usize, u32) -> isize { + compile_error!("unsupported OS for DST getrandom override"); +} + +// Hook `getentropy` and route it through the same warn-and-delegate path as +// `getrandom`. +// +// The 256-byte limit is part of the getentropy contract. Keeping this wrapper +// small means all entropy handling stays centralized in `getrandom`.
+#[unsafe(no_mangle)] +#[inline(never)] +unsafe extern "C" fn getentropy(buf: *mut u8, buflen: usize) -> i32 { + if buflen > 256 { + return -1; + } + match unsafe { getrandom(buf, buflen, 0) } { + -1 => -1, + _ => 0, + } +} + +#[cfg(test)] +mod tests { + use crate::sim; + + use super::getentropy; + + #[test] + #[cfg(unix)] + fn runtime_forbids_system_thread_spawn() { + let mut runtime = sim::Runtime::new(200); + super::block_on(&mut runtime, async { + let result = std::panic::catch_unwind(|| std::thread::Builder::new().spawn(|| {})); + assert!(result.is_err()); + }); + } + + #[test] + fn getentropy_delegates_to_host_randomness_outside_simulation() { + let mut actual = [0u8; 24]; + unsafe { + assert_eq!(getentropy(actual.as_mut_ptr(), actual.len()), 0); + } + } +} diff --git a/crates/runtime/tests/sim_e2e.rs b/crates/runtime/tests/sim_e2e.rs new file mode 100644 index 00000000000..1f505696801 --- /dev/null +++ b/crates/runtime/tests/sim_e2e.rs @@ -0,0 +1,366 @@ +#![cfg(feature = "simulation")] + +use std::{sync::Arc, time::Duration}; + +use futures::{ + channel::{mpsc, oneshot}, + StreamExt, +}; +use spacetimedb_runtime::sim::{buggify, Rng, Runtime}; +use spin::Mutex; + +/// One reply produced by the simulated server. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +struct Response { + id: u64, + value: u64, + at: Duration, +} + +/// Trace entries recorded by the server so tests can assert schedule/fault outcomes. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +enum ServerEvent { + Received { id: u64, at: Duration }, + Dropped { id: u64, at: Duration }, + Replied { id: u64, at: Duration }, +} + +/// A client request submitted to the simulated server. +struct Request { + id: u64, + input: u64, + respond_to: oneshot::Sender, +} + +/// Complete result of the client/server workload for one seed. 
+#[derive(Debug, Eq, PartialEq)] +struct ClientServerRun { + responses: Vec<(u64, Option)>, + server_events: Vec, + elapsed: Duration, +} + +/// Checks the "same seed, same trace" side of the client/server workload. +/// Both the client-visible results and the server-side event trace should stay +/// stable for one fixed seed. +#[test] +fn client_server_buggify_injects_deterministic_faults() { + let run = run_buggified_client_server(404); + + assert_eq!( + run.responses, + vec![ + (0, None), + ( + 1, + Some(Response { + id: 1, + value: 50, + at: Duration::from_millis(2), + }), + ), + ( + 2, + Some(Response { + id: 2, + value: 70, + at: Duration::from_millis(3), + }), + ), + (3, None), + ( + 4, + Some(Response { + id: 4, + value: 110, + at: Duration::from_millis(5), + }), + ), + ] + ); + assert_eq!( + run.server_events, + vec![ + ServerEvent::Received { + id: 3, + at: Duration::ZERO, + }, + ServerEvent::Received { + id: 0, + at: Duration::ZERO, + }, + ServerEvent::Received { + id: 2, + at: Duration::ZERO, + }, + ServerEvent::Received { + id: 4, + at: Duration::ZERO, + }, + ServerEvent::Received { + id: 1, + at: Duration::ZERO, + }, + ServerEvent::Dropped { + id: 0, + at: Duration::from_millis(1), + }, + ServerEvent::Replied { + id: 1, + at: Duration::from_millis(2), + }, + ServerEvent::Replied { + id: 2, + at: Duration::from_millis(3), + }, + ServerEvent::Dropped { + id: 3, + at: Duration::from_millis(4), + }, + ServerEvent::Replied { + id: 4, + at: Duration::from_millis(5), + }, + ] + ); + assert_eq!(run.elapsed, Duration::from_millis(5)); +} + +/// Checks the "different seed, different exploration" side of the same +/// client/server workload. The full run result should differ across seeds. 
+#[test] +fn client_server_buggify_differs_across_seeds() { + let seed_404 = run_buggified_client_server(404); + let seed_405 = run_buggified_client_server(405); + + eprintln!("seed 404: {seed_404:#?}"); + eprintln!("seed 405: {seed_405:#?}"); + assert_ne!(seed_404, seed_405); +} + +/// Fixed request set used by the client workload. +const CLIENT_REQUESTS: [(u64, u64); 5] = [(0, 4), (1, 5), (2, 7), (3, 9), (4, 11)]; + +/// Run a small concurrent client/server workload under one seed. +/// +/// The client submits every request from its own simulated task. The server +/// receives requests in scheduler order, then spawns one worker per request. +/// Each worker sleeps for deterministic virtual latency and may drop the reply +/// based on buggify. +fn run_buggified_client_server(seed: u64) -> ClientServerRun { + // --- setup: runtime, buggify, two nodes, and communication channels --- + let mut runtime = Runtime::new(seed); + buggify::enable(&runtime); + let handle = runtime.handle(); + let client_node = runtime.create_node().name("client").build(); + let server_node = runtime.create_node().name("server").build(); + // mpsc channel: client tasks send Request messages to the server task + let (request_tx, mut request_rx) = mpsc::unbounded::(); + let server_events = Arc::new(Mutex::new(Vec::new())); + + let (responses, server_events) = runtime.block_on(async move { + // --- server: receive 5 requests, spawn one worker per request --- + let server_handle = handle.clone(); + let server_events_for_server = Arc::clone(&server_events); + let server = server_node.clone().spawn(async move { + let mut workers = Vec::new(); + // Receive all 5 requests before processing any replies + for _ in 0..5 { + let request = request_rx.next().await.expect("client should send request"); + server_events_for_server.lock().push(ServerEvent::Received { + id: request.id, + at: server_handle.now(), + }); + + // --- server worker: simulate latency, then drop or reply based on buggify --- + let 
worker_handle = server_handle.clone(); + let worker_events = Arc::clone(&server_events_for_server); + workers.push(server_node.clone().spawn(async move { + // Deterministic virtual latency: each request id has a distinct sleep + worker_handle.sleep(Duration::from_millis(request.id + 1)).await; + // buggify decides whether to drop this request (40% probability) + if worker_handle.buggify_with_prob(0.4) { + worker_events.lock().push(ServerEvent::Dropped { + id: request.id, + at: worker_handle.now(), + }); + return; + } + + // No fault injected: send the reply + let response = Response { + id: request.id, + value: request.input * 10, + at: worker_handle.now(), + }; + worker_events.lock().push(ServerEvent::Replied { + id: request.id, + at: response.at, + }); + request + .respond_to + .send(response) + .expect("client should wait for response"); + })); + } + + // Wait for all server workers to complete + for worker in workers { + worker.await.expect("server worker should complete"); + } + }); + + // --- client: spawn one task per request, send them to server, collect responses --- + let client_outer_node = client_node.clone(); + let client = client_node.spawn(async move { + let mut requests = Vec::new(); + // Spawn a task for each request so they submit concurrently + for (id, input) in CLIENT_REQUESTS { + let request_tx = request_tx.clone(); + let client_request_node = client_outer_node.clone(); + requests.push(client_request_node.spawn(async move { + let (respond_to, response_rx) = oneshot::channel(); + request_tx + .unbounded_send(Request { id, input, respond_to }) + .expect("server inbox should be open"); + // Await the server's reply (None if the server dropped this request) + (id, response_rx.await.ok()) + })); + } + // Drop the original sender; every request task owns its own clone + drop(request_tx); + + // Collect responses in spawn order + let mut responses = Vec::new(); + for request in requests { + responses.push(request.await.expect("client request task
should complete")); + } + responses + }); + + // Drive both client and server to completion + let responses = client.await.expect("client task should complete"); + server.await.expect("server task should complete"); + (responses, server_events.lock().clone()) + }); + + // --- package the results: client responses, server trace, and total virtual time --- + ClientServerRun { + responses, + server_events, + elapsed: runtime.elapsed(), + } +} + +/// Exercises the executor, node pause/resume, and timer wheel together: +/// paused node work must not run until resumed, and all nodes must observe +/// one shared virtual clock. +#[test] +fn multi_node_runtime_coordinates_pause_resume_and_virtual_time() { + let mut runtime = Runtime::new(101); + let handle = runtime.handle(); + let node_a = runtime.create_node().name("a").build(); + let node_b = runtime.create_node().name("b").build(); + let events = Arc::new(Mutex::new(Vec::new())); + + node_b.pause(); + + runtime.block_on({ + let events = Arc::clone(&events); + async move { + let a_handle = handle.clone(); + let a_events = Arc::clone(&events); + let a = node_a.spawn(async move { + a_events.lock().push(("a_started", a_handle.now())); + a_handle.sleep(Duration::from_millis(3)).await; + a_events.lock().push(("a_finished", a_handle.now())); + }); + + let b_handle = handle.clone(); + let b_events = Arc::clone(&events); + let b = node_b.spawn(async move { + b_events.lock().push(("b_started", b_handle.now())); + b_handle.sleep(Duration::from_millis(2)).await; + b_events.lock().push(("b_finished", b_handle.now())); + }); + + handle.sleep(Duration::from_millis(1)).await; + events.lock().push(("main_resumed_b", handle.now())); + node_b.resume(); + + a.await.expect("node a task should complete"); + b.await.expect("node b task should complete"); + } + }); + + let events = events.lock().clone(); + assert!(events.contains(&("a_started", Duration::ZERO))); + assert!(events.contains(&("main_resumed_b", Duration::from_millis(1)))); + 
assert!(events.contains(&("b_started", Duration::from_millis(1)))); + assert!(events.contains(&("a_finished", Duration::from_millis(3)))); + assert!(events.contains(&("b_finished", Duration::from_millis(3)))); + assert_eq!(runtime.elapsed(), Duration::from_millis(3)); +} + +/// Checks that runtime-owned buggify decisions consume the same seeded RNG +/// sequence as an explicit `Rng`, making injected faults replayable by seed. +#[test] +fn runtime_buggify_matches_standalone_rng_sequence() { + let seed = 77; + let runtime = Runtime::new(seed); + let expected = Rng::new(seed); + + buggify::enable(&runtime); + expected.enable_buggify(); + + let actual = (0..8) + .map(|_| buggify::should_inject_fault_with_prob(&runtime, 0.5)) + .collect::>(); + let expected = (0..8).map(|_| expected.buggify_with_prob(0.5)).collect::>(); + + assert_eq!(actual, expected); + assert!(buggify::is_enabled(&runtime)); + + buggify::disable(&runtime); + assert!(!buggify::is_enabled(&runtime)); + assert!(!buggify::should_inject_fault_with_prob(&runtime, 1.0)); +} + +/// Verifies timeout races are driven by virtual time, not wall time: the fast +/// node completes at 2ms, then the slow node times out at the shared 4ms +/// deadline. 
+#[test] +fn multi_node_timeout_uses_shared_virtual_clock() { + let mut runtime = Runtime::new(303); + let handle = runtime.handle(); + let slow_node = runtime.create_node().name("slow").build(); + let fast_node = runtime.create_node().name("fast").build(); + + let output = runtime.block_on(async move { + let slow_handle = handle.clone(); + let slow = slow_node.spawn(async move { + slow_handle + .timeout(Duration::from_millis(4), async { + slow_handle.sleep(Duration::from_millis(10)).await; + "slow-finished" + }) + .await + }); + + let fast_handle = handle.clone(); + let fast = fast_node.spawn(async move { + fast_handle.sleep(Duration::from_millis(2)).await; + ("fast-finished", fast_handle.now()) + }); + + ( + slow.await.expect("slow node task should complete"), + fast.await.expect("fast node task should complete"), + ) + }); + + let (slow, fast) = output; + assert_eq!(fast, ("fast-finished", Duration::from_millis(2))); + assert_eq!(slow.unwrap_err().duration(), Duration::from_millis(4)); + assert_eq!(runtime.elapsed(), Duration::from_millis(4)); +} diff --git a/crates/snapshot/Cargo.toml b/crates/snapshot/Cargo.toml index f9f767ce18e..aa51c4e3bd8 100644 --- a/crates/snapshot/Cargo.toml +++ b/crates/snapshot/Cargo.toml @@ -35,6 +35,7 @@ spacetimedb-core = { path = "../core", features = ["test"] } spacetimedb-schema = { path = "../schema" } spacetimedb-datastore = { path = "../datastore", features = ["test"] } spacetimedb-durability = { workspace = true, features = ["test"] } +spacetimedb-runtime = { workspace = true } anyhow.workspace = true env_logger.workspace = true diff --git a/crates/snapshot/src/lib.rs b/crates/snapshot/src/lib.rs index 6af30dc0f26..55ae62f074b 100644 --- a/crates/snapshot/src/lib.rs +++ b/crates/snapshot/src/lib.rs @@ -48,6 +48,7 @@ use std::fs::{self, File}; use std::io; use std::ops::{Range, RangeBounds}; use std::path::Path; +use std::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard}; use std::time::{Duration, Instant}; use std::{ 
collections::BTreeMap, @@ -1369,20 +1370,23 @@ impl SnapshotRepository { } } -/// Snapshot storage backend. -pub trait SnapshotRepo: Send + Sync { - type Pending: PendingSnapshot; - +/// Snapshot storage backend that can capture, read, list, and invalidate snapshots. +/// +/// Production uses the filesystem-backed [`SnapshotRepository`]. DST can use +/// [`MemorySnapshotRepository`] to keep snapshot storage inside the simulator +/// boundary instead of depending on temporary directories or host filesystem +/// behavior. +pub trait SnapshotStore: Send + Sync { /// Return the database identity associated with this snapshot backend. fn database_identity(&self) -> Identity; - /// Start creating a snapshot at `tx_offset` from the provided tables and blob store. - fn create_snapshot<'db>( + /// Capture and finalize a snapshot at `tx_offset`. + fn capture_snapshot<'db>( &self, tables: &mut dyn Iterator, blobs: &'db dyn BlobStore, tx_offset: TxOffset, - ) -> Result; + ) -> Result; /// Reconstruct the snapshot at `tx_offset` using the supplied page pool. fn read_snapshot(&self, tx_offset: TxOffset, page_pool: &PagePool) -> Result; @@ -1395,6 +1399,25 @@ pub trait SnapshotRepo: Send + Sync { self.latest_snapshot_older_than(TxOffset::MAX) } + /// Invalidate every snapshot newer than `upper_bound`. + fn invalidate_newer_snapshots(&self, upper_bound: TxOffset) -> Result<(), SnapshotError>; + + /// Invalidate the snapshot at `tx_offset`. + fn invalidate_snapshot(&self, tx_offset: TxOffset) -> Result<(), SnapshotError>; +} + +/// Filesystem-style snapshot backend with a pending snapshot phase and optional compression. +pub trait SnapshotRepo: SnapshotStore { + type Pending: PendingSnapshot; + + /// Start creating a snapshot at `tx_offset` from the provided tables and blob store. 
+ fn create_snapshot<'db>( + &self, + tables: &mut dyn Iterator, + blobs: &'db dyn BlobStore, + tx_offset: TxOffset, + ) -> Result; + /// Attempt to compress all snapshots that fall into `range`, and record /// the outcome in `stats`. /// @@ -1403,30 +1426,21 @@ pub trait SnapshotRepo: Send + Sync { /// /// See [CompressionStats] for how to interpret the results. fn compress_snapshots(&self, stats: &mut CompressionStats, range: Range) -> Result<(), SnapshotError>; - - /// Invalidate every snapshot newer than `upper_bound`. - fn invalidate_newer_snapshots(&self, upper_bound: TxOffset) -> Result<(), SnapshotError>; - - /// Invalidate the snapshot at `tx_offset`. - fn invalidate_snapshot(&self, tx_offset: TxOffset) -> Result<(), SnapshotError>; } -impl SnapshotRepo for SnapshotRepository { - type Pending = BoxedPendingSnapshot; - +impl SnapshotStore for SnapshotRepository { fn database_identity(&self) -> Identity { SnapshotRepository::database_identity(self) } - fn create_snapshot<'db>( + fn capture_snapshot<'db>( &self, tables: &mut dyn Iterator, blobs: &'db dyn BlobStore, tx_offset: TxOffset, - ) -> Result { - Ok(Box::new(SnapshotRepository::create_snapshot( - self, tables, blobs, tx_offset, - )?)) + ) -> Result { + self.create_snapshot(tables, blobs, tx_offset)?.sync_all()?; + Ok(tx_offset) } fn read_snapshot(&self, tx_offset: TxOffset, page_pool: &PagePool) -> Result { @@ -1441,17 +1455,316 @@ impl SnapshotRepo for SnapshotRepository { SnapshotRepository::latest_snapshot(self) } + fn invalidate_newer_snapshots(&self, upper_bound: TxOffset) -> Result<(), SnapshotError> { + SnapshotRepository::invalidate_newer_snapshots(self, upper_bound) + } + + fn invalidate_snapshot(&self, tx_offset: TxOffset) -> Result<(), SnapshotError> { + SnapshotRepository::invalidate_snapshot(self, tx_offset) + } +} + +impl SnapshotRepo for SnapshotRepository { + type Pending = BoxedPendingSnapshot; + + fn create_snapshot<'db>( + &self, + tables: &mut dyn Iterator, + blobs: &'db dyn 
BlobStore, + tx_offset: TxOffset, + ) -> Result { + Ok(Box::new(SnapshotRepository::create_snapshot( + self, tables, blobs, tx_offset, + )?)) + } + fn compress_snapshots(&self, stats: &mut CompressionStats, range: Range) -> Result<(), SnapshotError> { SnapshotRepository::compress_snapshots(self, stats, range) } +} + +/// In-memory snapshot repository for deterministic tests. +/// +/// This stores snapshot object bytes in process memory and reconstructs through +/// the same [`ReconstructedSnapshot`] shape as the filesystem repository. It is +/// not durable and intentionally does not model the on-disk two-phase flush +/// protocol; it is a simulator/test backend for semantic snapshot capture and +/// restore. +pub struct MemorySnapshotRepository { + database_identity: Identity, + replica_id: u64, + snapshots: RwLock>, +} + +impl MemorySnapshotRepository { + pub fn new(database_identity: Identity, replica_id: u64) -> Self { + Self { + database_identity, + replica_id, + snapshots: RwLock::new(BTreeMap::new()), + } + } + + pub fn database_identity(&self) -> Identity { + self.database_identity + } + + pub fn capture_snapshot<'db>( + &self, + tables: impl Iterator, + blobs: &'db dyn BlobStore, + tx_offset: TxOffset, + ) -> Result { + self.invalidate_newer_snapshots(tx_offset.saturating_sub(1))?; + let snapshot = MemorySnapshot::capture(self.database_identity, self.replica_id, tables, blobs, tx_offset)?; + self.write_snapshots()?.insert(tx_offset, snapshot); + Ok(tx_offset) + } + + pub fn read_snapshot( + &self, + tx_offset: TxOffset, + page_pool: &PagePool, + ) -> Result { + let snapshot = self + .read_snapshots()? + .get(&tx_offset) + .cloned() + .ok_or_else(|| memory_snapshot_not_found(tx_offset))?; + snapshot.reconstruct(page_pool) + } + + pub fn latest_snapshot_older_than(&self, upper_bound: TxOffset) -> Result, SnapshotError> { + Ok(self + .read_snapshots()? 
+ .range(..=upper_bound) + .next_back() + .map(|(&tx_offset, _)| tx_offset)) + } + + pub fn latest_snapshot(&self) -> Result, SnapshotError> { + self.latest_snapshot_older_than(TxOffset::MAX) + } + + pub fn invalidate_newer_snapshots(&self, upper_bound: TxOffset) -> Result<(), SnapshotError> { + self.write_snapshots()?.retain(|tx_offset, _| *tx_offset <= upper_bound); + Ok(()) + } + + pub fn invalidate_snapshot(&self, tx_offset: TxOffset) -> Result<(), SnapshotError> { + self.write_snapshots()?.remove(&tx_offset); + Ok(()) + } + + fn read_snapshots(&self) -> Result>, SnapshotError> { + self.snapshots.read().map_err(|_| memory_snapshot_lock_poisoned()) + } + + fn write_snapshots(&self) -> Result>, SnapshotError> { + self.snapshots.write().map_err(|_| memory_snapshot_lock_poisoned()) + } +} + +impl SnapshotStore for MemorySnapshotRepository { + fn database_identity(&self) -> Identity { + MemorySnapshotRepository::database_identity(self) + } + + fn capture_snapshot<'db>( + &self, + tables: &mut dyn Iterator, + blobs: &'db dyn BlobStore, + tx_offset: TxOffset, + ) -> Result { + MemorySnapshotRepository::capture_snapshot(self, tables, blobs, tx_offset) + } + + fn read_snapshot(&self, tx_offset: TxOffset, page_pool: &PagePool) -> Result { + MemorySnapshotRepository::read_snapshot(self, tx_offset, page_pool) + } + + fn latest_snapshot_older_than(&self, upper_bound: TxOffset) -> Result, SnapshotError> { + MemorySnapshotRepository::latest_snapshot_older_than(self, upper_bound) + } + + fn latest_snapshot(&self) -> Result, SnapshotError> { + MemorySnapshotRepository::latest_snapshot(self) + } fn invalidate_newer_snapshots(&self, upper_bound: TxOffset) -> Result<(), SnapshotError> { - SnapshotRepository::invalidate_newer_snapshots(self, upper_bound) + MemorySnapshotRepository::invalidate_newer_snapshots(self, upper_bound) } fn invalidate_snapshot(&self, tx_offset: TxOffset) -> Result<(), SnapshotError> { - SnapshotRepository::invalidate_snapshot(self, tx_offset) + 
MemorySnapshotRepository::invalidate_snapshot(self, tx_offset) + } +} + +struct MemoryPendingSnapshot { + tx_offset: TxOffset, +} + +impl PendingSnapshot for MemoryPendingSnapshot { + fn sync_all(self: Box) -> Result { + Ok(self.tx_offset) + } +} + +impl SnapshotRepo for MemorySnapshotRepository { + type Pending = BoxedPendingSnapshot; + + fn create_snapshot<'db>( + &self, + tables: &mut dyn Iterator, + blobs: &'db dyn BlobStore, + tx_offset: TxOffset, + ) -> Result { + self.capture_snapshot(tables, blobs, tx_offset)?; + Ok(Box::new(MemoryPendingSnapshot { tx_offset })) + } + + fn compress_snapshots(&self, _stats: &mut CompressionStats, _range: Range) -> Result<(), SnapshotError> { + Ok(()) + } +} + +#[derive(Clone)] +struct MemorySnapshot { + database_identity: Identity, + replica_id: u64, + tx_offset: TxOffset, + module_abi_version: [u16; 2], + blobs: Vec, + tables: BTreeMap>, +} + +impl MemorySnapshot { + fn capture<'db>( + database_identity: Identity, + replica_id: u64, + tables: impl Iterator, + blobs: &'db dyn BlobStore, + tx_offset: TxOffset, + ) -> Result { + let blobs = blobs + .iter_blobs() + .map(|(hash, uses, bytes)| MemoryBlob { + hash: *hash, + uses: uses as u32, + bytes: bytes.into(), + }) + .collect(); + + let tables = tables + .map(|table| { + let pages = table + .iter_pages_with_hashes() + .map(|(hash, page)| { + let bytes = bsatn::to_vec(page).map_err(|cause| SnapshotError::Serialize { + ty: ObjectType::Page(hash), + cause, + })?; + Ok(MemoryPage { hash, bytes }) + }) + .collect::, SnapshotError>>()?; + Ok((table.schema.table_id, pages)) + }) + .collect::, SnapshotError>>()?; + + Ok(Self { + database_identity, + replica_id, + tx_offset, + module_abi_version: CURRENT_MODULE_ABI_VERSION, + blobs, + tables, + }) } + + fn reconstruct(self, page_pool: &PagePool) -> Result { + let source_repo = memory_snapshot_path(self.tx_offset); + let mut blob_store = HashMapBlobStore::default(); + for MemoryBlob { hash, uses, bytes } in self.blobs { + let computed 
= BlobHash::hash_from_bytes(&bytes); + if hash != computed { + return Err(SnapshotError::HashMismatch { + ty: ObjectType::Blob(hash), + expected: hash.data, + computed: computed.data, + source_repo: source_repo.clone(), + }); + } + blob_store.insert_with_uses(&hash, uses as usize, bytes); + } + + let tables = + self.tables + .into_iter() + .map(|(table_id, pages)| { + let pages = pages + .into_iter() + .map(|MemoryPage { hash, bytes }| { + let page = page_pool.take_deserialize_from(&bytes).map_err(|cause| { + SnapshotError::Deserialize { + ty: ObjectType::Page(hash), + source_repo: source_repo.clone(), + cause, + } + })?; + let computed = page.content_hash(); + if hash != computed { + return Err(SnapshotError::HashMismatch { + ty: ObjectType::Page(hash), + expected: *hash.as_bytes(), + computed: *computed.as_bytes(), + source_repo: source_repo.clone(), + }); + } + Ok(page) + }) + .collect::, SnapshotError>>()?; + Ok((table_id, pages)) + }) + .collect::, SnapshotError>>()?; + + Ok(ReconstructedSnapshot { + database_identity: self.database_identity, + replica_id: self.replica_id, + tx_offset: self.tx_offset, + module_abi_version: self.module_abi_version, + blob_store, + tables, + compress_type: CompressType::None, + }) + } +} + +#[derive(Clone)] +struct MemoryBlob { + hash: BlobHash, + uses: u32, + bytes: Box<[u8]>, +} + +#[derive(Clone)] +struct MemoryPage { + hash: blake3::Hash, + bytes: Vec, +} + +fn memory_snapshot_lock_poisoned() -> SnapshotError { + SnapshotError::Io(io::Error::other("memory snapshot repository lock poisoned")) +} + +fn memory_snapshot_not_found(tx_offset: TxOffset) -> SnapshotError { + SnapshotError::Io(io::Error::new( + io::ErrorKind::NotFound, + format!("memory snapshot {tx_offset} not found"), + )) +} + +fn memory_snapshot_path(tx_offset: TxOffset) -> PathBuf { + PathBuf::from(format!("")) } pub struct ReconstructedSnapshot { diff --git a/crates/snapshot/tests/remote.rs b/crates/snapshot/tests/remote.rs index 41097b33abd..e7133191ffa 100644 
--- a/crates/snapshot/tests/remote.rs +++ b/crates/snapshot/tests/remote.rs @@ -23,6 +23,7 @@ use spacetimedb_lib::{ }; use spacetimedb_paths::{server::SnapshotsPath, FromPathUnchecked}; use spacetimedb_primitives::TableId; +use spacetimedb_runtime::Handle; use spacetimedb_sats::{product, raw_identifier::RawIdentifier}; use spacetimedb_schema::{ def::ModuleDef, @@ -227,14 +228,16 @@ impl SourceSnapshot { async fn create_snapshot(repo: Arc) -> anyhow::Result { let start = Instant::now(); - let rt = tokio::runtime::Handle::current(); + let rt = spacetimedb_runtime::Handle::tokio_current(); // NOTE: `_db` needs to stay alive until the snapshot is taken, // because the snapshot worker holds only a weak reference. - let (mut watch, _db) = spawn_blocking(|| { + let (mut watch, _db) = spawn_blocking(move || { + let snapshot_worker = SnapshotWorker::new(repo, snapshot::Compression::Disabled, rt.clone()); let persistence = Persistence { durability: Arc::new(NoDurability::default()), disk_size: Arc::new(|| Ok(<_>::default())), - snapshots: Some(SnapshotWorker::new(repo, snapshot::Compression::Disabled)), + snapshot_store: Some(snapshot_worker.snapshot_store()), + snapshots: Some(snapshot_worker), runtime: rt, }; let db = TestDB::open_db(EmptyHistory::new(), Some(persistence), None, 0)?; diff --git a/crates/standalone/src/subcommands/start.rs b/crates/standalone/src/subcommands/start.rs index 50f6db19257..bc8241938d2 100644 --- a/crates/standalone/src/subcommands/start.rs +++ b/crates/standalone/src/subcommands/start.rs @@ -1,12 +1,18 @@ +#[cfg(not(simulation))] use netstat2::{get_sockets_info, AddressFamilyFlags, ProtocolFlags, ProtocolSocketInfo, TcpState}; +#[cfg(not(simulation))] use spacetimedb_client_api::routes::identity::IdentityRoutes; +#[cfg(not(simulation))] use spacetimedb_pg::pg_server; +#[cfg(not(simulation))] use std::io::{self, Write}; +#[cfg(not(simulation))] use std::net::IpAddr; use std::sync::Arc; use crate::{StandaloneEnv, StandaloneOptions}; use 
anyhow::Context; +#[cfg(not(simulation))] use axum::extract::DefaultBodyLimit; use clap::ArgAction::SetTrue; use clap::{Arg, ArgMatches}; @@ -15,11 +21,14 @@ use spacetimedb::db::{self, Storage}; use spacetimedb::startup::{self, TracingOptions}; use spacetimedb::util::jobs::JobCores; use spacetimedb::worker_metrics; +#[cfg(not(simulation))] use spacetimedb_client_api::routes::database::DatabaseRoutes; +#[cfg(not(simulation))] use spacetimedb_client_api::routes::router; use spacetimedb_client_api::routes::subscribe::WebSocketOptions; use spacetimedb_paths::cli::{PrivKeyPath, PubKeyPath}; use spacetimedb_paths::server::{ConfigToml, ServerDataDir}; +#[cfg(not(simulation))] use tokio::net::TcpListener; pub fn cli() -> clap::Command { @@ -111,6 +120,7 @@ impl ConfigFile { pub async fn exec(args: &ArgMatches, db_cores: JobCores) -> anyhow::Result<()> { let listen_addr = args.get_one::("listen_addr").unwrap(); let pg_port = args.get_one::("pg_port"); + #[cfg(not(simulation))] let non_interactive = args.get_flag("non_interactive"); let cert_dir = args.get_one::("jwt_key_dir"); let certs = Option::zip( @@ -198,13 +208,26 @@ pub async fn exec(args: &ArgMatches, db_cores: JobCores) -> anyhow::Result<()> { ); worker_metrics::spawn_page_pool_stats(listen_addr.clone(), ctx.page_pool().clone()); worker_metrics::spawn_bsatn_rlb_pool_stats(listen_addr.clone(), ctx.bsatn_rlb_pool().clone()); + #[cfg(simulation)] + { + let _ = (pg_port, ctx, listen_addr); + anyhow::bail!("standalone start server mode is not supported under simulation"); + } + + #[cfg(not(simulation))] let mut db_routes = DatabaseRoutes::default(); - db_routes.root_post = db_routes.root_post.layer(DefaultBodyLimit::disable()); - db_routes.db_put = db_routes.db_put.layer(DefaultBodyLimit::disable()); - db_routes.pre_publish = db_routes.pre_publish.layer(DefaultBodyLimit::disable()); + #[cfg(not(simulation))] + { + db_routes.root_post = db_routes.root_post.layer(DefaultBodyLimit::disable()); + db_routes.db_put = 
db_routes.db_put.layer(DefaultBodyLimit::disable()); + db_routes.pre_publish = db_routes.pre_publish.layer(DefaultBodyLimit::disable()); + } + #[cfg(not(simulation))] let extra = axum::Router::new().nest("/health", spacetimedb_client_api::routes::health::router()); + #[cfg(not(simulation))] let service = router(&ctx, db_routes, IdentityRoutes::default(), extra).with_state(ctx.clone()); + #[cfg(not(simulation))] // Check if the requested port is available on both IPv4 and IPv6. // If not, offer to find an available port by incrementing (unless non-interactive). let listen_addr = if let Some((host, port_str)) = listen_addr.rsplit_once(':') { @@ -250,40 +273,44 @@ pub async fn exec(args: &ArgMatches, db_cores: JobCores) -> anyhow::Result<()> { listen_addr.to_string() }; - let tcp = TcpListener::bind(&listen_addr).await.context(format!( - "failed to bind the SpacetimeDB server to '{listen_addr}', please check that the address is valid and not already in use" - ))?; - socket2::SockRef::from(&tcp).set_nodelay(true)?; - log::info!("Starting SpacetimeDB listening on {}", tcp.local_addr()?); - - if let Some(pg_port) = pg_port { - let server_addr = listen_addr.split(':').next().unwrap(); - let tcp_pg = TcpListener::bind(format!("{server_addr}:{pg_port}")).await.context(format!( - "failed to bind the SpacetimeDB PostgreSQL wire protocol server to {server_addr}:{pg_port}, please check that the port is valid and not already in use" + #[cfg(not(simulation))] + { + let tcp = TcpListener::bind(&listen_addr).await.context(format!( + "failed to bind the SpacetimeDB server to '{listen_addr}', please check that the address is valid and not already in use" ))?; - - let notify = Arc::new(tokio::sync::Notify::new()); - let shutdown_notify = notify.clone(); - tokio::select! 
{ - _ = pg_server::start_pg(notify.clone(), ctx, tcp_pg) => {}, - _ = axum::serve(tcp, service).with_graceful_shutdown(async move { - shutdown_notify.notified().await; - }) => {}, - _ = tokio::signal::ctrl_c() => { - println!("Shutting down servers..."); - notify.notify_waiters(); // Notify all tasks + socket2::SockRef::from(&tcp).set_nodelay(true)?; + log::info!("Starting SpacetimeDB listening on {}", tcp.local_addr()?); + + if let Some(pg_port) = pg_port { + let server_addr = listen_addr.split(':').next().unwrap(); + let tcp_pg = TcpListener::bind(format!("{server_addr}:{pg_port}")).await.context(format!( + "failed to bind the SpacetimeDB PostgreSQL wire protocol server to {server_addr}:{pg_port}, please check that the port is valid and not already in use" + ))?; + + let notify = Arc::new(tokio::sync::Notify::new()); + let shutdown_notify = notify.clone(); + tokio::select! { + _ = pg_server::start_pg(notify.clone(), ctx, tcp_pg) => {}, + _ = axum::serve(tcp, service).with_graceful_shutdown(async move { + shutdown_notify.notified().await; + }) => {}, + _ = tokio::signal::ctrl_c() => { + println!("Shutting down servers..."); + notify.notify_waiters(); // Notify all tasks + } } + } else { + log::warn!("PostgreSQL wire protocol server disabled"); + axum::serve(tcp, service) + .with_graceful_shutdown(async { + tokio::signal::ctrl_c().await.expect("failed to install Ctrl+C handler"); + log::info!("Shutting down server..."); + }) + .await?; } - } else { - log::warn!("PostgreSQL wire protocol server disabled"); - axum::serve(tcp, service) - .with_graceful_shutdown(async { - tokio::signal::ctrl_c().await.expect("failed to install Ctrl+C handler"); - log::info!("Shutting down server..."); - }) - .await?; } + #[cfg(not(simulation))] Ok(()) } @@ -302,6 +329,7 @@ pub async fn exec(args: &ArgMatches, db_cores: JobCores) -> anyhow::Result<()> { /// Note: There is a small race condition between this check and the actual bind - /// another process could grab the port in between. 
This is unlikely in practice /// and the actual bind will fail with a clear error if it happens. +#[cfg(not(simulation))] pub fn is_port_available(host: &str, port: u16) -> bool { let requested = match parse_host(host) { Some(r) => r, @@ -336,11 +364,13 @@ pub fn is_port_available(host: &str, port: u16) -> bool { } #[derive(Debug, Clone, Copy)] +#[cfg(not(simulation))] enum RequestedHost { Localhost, Ip(IpAddr), } +#[cfg(not(simulation))] fn parse_host(host: &str) -> Option { let host = host.trim(); @@ -354,6 +384,7 @@ fn parse_host(host: &str) -> Option { host.parse::().ok().map(RequestedHost::Ip) } +#[cfg(not(simulation))] fn conflicts(requested: RequestedHost, listener_addr: IpAddr) -> bool { match requested { RequestedHost::Localhost => match listener_addr { @@ -424,6 +455,7 @@ fn conflicts(requested: RequestedHost, listener_addr: IpAddr) -> bool { /// Find an available port starting from the requested port. /// Returns the first port that is available on both IPv4 and IPv6. +#[cfg(not(simulation))] fn find_available_port(host: &str, requested_port: u16, max_attempts: u16) -> Option { for offset in 0..max_attempts { let port = requested_port.saturating_add(offset); @@ -438,6 +470,7 @@ fn find_available_port(host: &str, requested_port: u16, max_attempts: u16) -> Op } /// Prompt the user with a yes/no question. Returns true if they answer yes. 
+#[cfg(not(simulation))] fn prompt_yes_no(question: &str) -> bool { print!("{} [y/N] ", question); io::stdout().flush().ok(); diff --git a/tools/ci/README.md b/tools/ci/README.md index 9b71b406fef..23bcfc6c23b 100644 --- a/tools/ci/README.md +++ b/tools/ci/README.md @@ -239,6 +239,17 @@ Usage: docs - `--help`: Print help +### `io-boundary` + +**Usage:** +```bash +Usage: io-boundary +``` + +**Options:** + +- `--help`: Print help + ### `help` **Usage:** diff --git a/tools/ci/src/main.rs b/tools/ci/src/main.rs index 2454ea3349f..99a9a2b86c3 100644 --- a/tools/ci/src/main.rs +++ b/tools/ci/src/main.rs @@ -161,6 +161,8 @@ enum CiCmd { VersionUpgradeCheck, /// Builds the docs site. Docs, + /// Checks that runtime is not used as a Tokio-shaped IO facade. + IoBoundary, } fn run_all_clap_subcommands(skips: &[String]) -> Result<()> { @@ -189,6 +191,99 @@ fn tracked_rs_files_under(path: &str) -> Result> { .collect()) } +fn check_io_boundary() -> Result<()> { + ensure_repo_root()?; + + let mut violations = Vec::new(); + for root in ["crates/runtime", "crates/datastore", "crates/core", "crates/commitlog"] { + for path in tracked_rs_files_under(root)? 
{ + check_file_for_runtime_io_facade(&path, &mut violations)?; + } + } + + if violations.is_empty() { + return Ok(()); + } + + for violation in &violations { + eprintln!("{violation}"); + } + bail!( + "spacetimedb_runtime must not be used as a Tokio-shaped io/fs/net facade; use Tokio directly in normal-only code and semantic seams for simulation code" + ); +} + +fn check_file_for_runtime_io_facade(path: &Path, violations: &mut Vec) -> Result<()> { + let contents = fs::read_to_string(path)?; + let mut in_runtime_use_tree = false; + + for (line_idx, line) in contents.lines().enumerate() { + let line_no = line_idx + 1; + let code = line.split("//").next().unwrap_or(line); + + for module in ["io", "fs", "net", "blocking_fs"] { + if code.contains(&format!("spacetimedb_runtime::{module}")) { + violations.push(format!( + "{}:{line_no}: spacetimedb_runtime::{module} facade usage", + path.display() + )); + } + if path == Path::new("crates/runtime/src/lib.rs") && code.contains(&format!("pub mod {module}")) { + violations.push(format!( + "{}:{line_no}: spacetimedb_runtime::{module} facade export", + path.display() + )); + } + } + + if in_runtime_use_tree { + for module in ["io", "fs", "net", "blocking_fs"] { + if use_tree_mentions_token(code, module) { + violations.push(format!( + "{}:{line_no}: spacetimedb_runtime::{module} facade import", + path.display() + )); + } + } + if code.contains("};") { + in_runtime_use_tree = false; + } + continue; + } + + if code.contains("use spacetimedb_runtime::{") { + for module in ["io", "fs", "net", "blocking_fs"] { + if use_tree_mentions_token(code, module) { + violations.push(format!( + "{}:{line_no}: spacetimedb_runtime::{module} facade import", + path.display() + )); + } + } + if !code.contains("};") { + in_runtime_use_tree = true; + } + } + } + + Ok(()) +} + +fn use_tree_mentions_token(code: &str, forbidden: &str) -> bool { + let mut token = String::new(); + for ch in code.chars() { + if ch == '_' || ch.is_ascii_alphanumeric() { + 
token.push(ch); + continue; + } + if token == forbidden { + return true; + } + token.clear(); + } + token == forbidden +} + fn run_publish_checks() -> Result<()> { cmd!("bash", "-lc", "test -d venv || python3 -m venv venv").run()?; cmd!("venv/bin/pip3", "install", "argparse", "toml").run()?; @@ -352,6 +447,7 @@ fn main() -> Result<()> { Some(CiCmd::Lint) => { ensure_repo_root()?; + check_io_boundary()?; // `cargo fmt --all` only checks files that Cargo discovers through workspace/package targets. // However, we also keep Rust sources in a locations that are tracked but not part of our workspace, // so this approach properly catches all the files, where `cargo fmt` does not. @@ -540,6 +636,10 @@ fn main() -> Result<()> { run_docs_build()?; } + Some(CiCmd::IoBoundary) => { + check_io_boundary()?; + } + None => run_all_clap_subcommands(&cli.skip)?, }