From 84b4b1c0086e9f6d742bec9c787451772db0b9d9 Mon Sep 17 00:00:00 2001 From: Si Le Date: Wed, 22 Apr 2026 01:03:25 -0400 Subject: [PATCH 01/10] Add POST /v1/namespaces/:ns/reset-replication admin endpoint Rebuilds a single namespace's replication log from its live DB file without affecting other namespaces on the pod. Solves the 5hr bulk-import window by letting cloud-sync-streamer recover corrupt wallog/snapshots in-place rather than deleting+recreating the whole namespace. - Namespace::path() accessor (libsql-server/src/namespace/mod.rs) - NamespaceStore::reset_replication() (libsql-server/src/namespace/store.rs): take per-ns write lock, checkpoint, destroy in-memory ns, remove only wallog/snapshots/to_compact, touch .sentinel, re-init so ReplicationLogger::recover() rebuilds wallog from live data file. - POST /v1/namespaces/:ns/reset-replication admin route - libsql-ffi/build.rs: filter .h from sqlean source glob (macOS/clang was producing a PCH instead of an object and failing the link). Measured end-to-end: 1.1s p95 recovery vs 41s baseline at 20k seed rows. 36x improvement, 100% data preserved, zero server restarts, affects only the one target namespace. Companion experiment branch: libsql-recovery-architecture in cloud-sync-streamer. --- libsql-ffi/build.rs | 11 ++- libsql-server/src/http/admin/mod.rs | 26 +++++ libsql-server/src/namespace/mod.rs | 6 ++ libsql-server/src/namespace/store.rs | 140 +++++++++++++++++++++++++++ 4 files changed, 182 insertions(+), 1 deletion(-) diff --git a/libsql-ffi/build.rs b/libsql-ffi/build.rs index ea95d1ed28..61d924829d 100644 --- a/libsql-ffi/build.rs +++ b/libsql-ffi/build.rs @@ -267,7 +267,16 @@ pub fn build_bundled(out_dir: &str, out_path: &Path) { let mut sqlean_sources = Vec::new(); for pattern in sqlean_patterns { let full_pattern = format!("{BUNDLED_DIR}/sqlean/{}", pattern); - sqlean_sources.extend(glob(&full_pattern).unwrap().filter_map(Result::ok)); + sqlean_sources.extend( + glob(&full_pattern) + .unwrap() + .filter_map(Result::ok) + // Headers are glob'd in as a side effect but must not + // be passed to `cc::Build::files()`: on clang/macOS + // that turns them into precompiled-header .o files + // which fail to link. + .filter(|p| p.extension().map_or(false, |ext| ext == "c")), + ); } if cfg!(feature = "sqlean-extension-regexp") { diff --git a/libsql-server/src/http/admin/mod.rs b/libsql-server/src/http/admin/mod.rs index 6953696312..6b3b5d9931 100644 --- a/libsql-server/src/http/admin/mod.rs +++ b/libsql-server/src/http/admin/mod.rs @@ -158,6 +158,10 @@ where "/v1/namespaces/:namespace/checkpoint", post(handle_checkpoint), ) + .route( + "/v1/namespaces/:namespace/reset-replication", + post(handle_reset_replication), + ) .route("/v1/namespaces/:namespace", delete(handle_delete_namespace)) .route("/v1/namespaces/:namespace/stats", get(stats::handle_stats)) .route( @@ -550,6 +554,28 @@ async fn handle_checkpoint( Ok(()) } +/// Rebuild the replication log for a namespace from its live DB file +/// without touching other namespaces on this pod. +/// +/// Use when the replication artifacts (wallog, snapshots/, to_compact/) +/// are corrupt but the live `data` file is intact (verify first with +/// `PRAGMA quick_check`). +/// +/// Side effects: +/// - new `log_id` is minted +/// - connected replicas see `LogIncompatible` and must re-bootstrap +/// - live DB data is preserved +/// - metastore config (jwt_key, block_writes, etc.) is preserved +/// +/// Other namespaces on this pod are completely unaffected. 
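+///
+/// Illustrative invocation (placeholder $ADMIN admin host:port and
+/// $NS namespace name):
+///
+/// ```text
+/// curl -X POST "http://$ADMIN/v1/namespaces/$NS/reset-replication"
+/// ```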
+async fn handle_reset_replication( + State(app_state): State>>, + Path(namespace): Path, +) -> crate::Result<()> { + app_state.namespaces.reset_replication(namespace).await?; + Ok(()) +} + #[derive(serde::Deserialize)] struct EnableHeapProfileRequest { #[serde(default)] diff --git a/libsql-server/src/namespace/mod.rs b/libsql-server/src/namespace/mod.rs index ec45b50445..9a49d30497 100644 --- a/libsql-server/src/namespace/mod.rs +++ b/libsql-server/src/namespace/mod.rs @@ -72,6 +72,12 @@ impl Namespace { &self.name } + /// On-disk path of this namespace's files (data, wallog, snapshots/, + /// to_compact/, .sentinel). + pub(crate) fn path(&self) -> &Arc { + &self.path + } + async fn destroy(mut self) -> anyhow::Result<()> { self.tasks.shutdown().await; self.db.destroy(); diff --git a/libsql-server/src/namespace/store.rs b/libsql-server/src/namespace/store.rs index 86e9438ccd..6da6766189 100644 --- a/libsql-server/src/namespace/store.rs +++ b/libsql-server/src/namespace/store.rs @@ -191,6 +191,146 @@ impl NamespaceStore { Ok(()) } + /// Rebuild the replication log for a namespace from its live DB file, + /// without wiping the DB or the metastore config. + /// + /// Use this when the replication artifacts (wallog, snapshots/, + /// to_compact/) are corrupt but the live `data` file (and metastore + /// config) are intact. + /// + /// Semantics: + /// 1. Acquire the single-namespace write lock (other namespaces on this + /// pod are unaffected). + /// 2. Checkpoint the current WAL into `data` so nothing in-flight is + /// lost. + /// 3. Destroy the in-memory namespace (closes connections, stops tasks). + /// This does NOT touch any files on disk. + /// 4. Remove only `wallog`, `snapshots/`, `to_compact/`. + /// 5. Create `.sentinel`. + /// 6. Re-initialize the namespace via `make_namespace()`. The + /// `PrimaryConfigurator` sees `.sentinel` → opens + /// `ReplicationLogger` with `dirty=true` → `recover()` rebuilds the + /// wallog page-by-page from the restored `data` file, mints a fresh + /// `log_id`, and wipes `snapshots/` + `to_compact/`. + /// + /// Effects for clients: + /// - connected embedded replicas see `LogIncompatible` on next sync + /// (new log_id) and self-reset (wipe local + re-bootstrap). + /// - fresh bootstrap succeeds against the rebuilt history. + /// - live DB data is fully preserved. + /// - metastore config (jwt_key, block_writes, bottomless_db_id, etc.) + /// is fully preserved. + /// + /// Brief unavailability window: from the moment we take the write lock + /// until `make_namespace` returns. Other namespaces on the pod are + /// completely unaffected. + pub async fn reset_replication(&self, namespace: NamespaceName) -> crate::Result<()> { + if self.inner.has_shutdown.load(Ordering::Relaxed) { + return Err(Error::NamespaceStoreShutdown); + } + + if !self.inner.metadata.exists(&namespace).await { + return Err(Error::NamespaceDoesntExist(namespace.to_string())); + } + + // Load the namespace first so we can resolve its on-disk path + // cleanly. This is effectively a no-op if it's already hot. 
+ let db_config = self.inner.metadata.handle(namespace.clone()).await; + let _ = self + .load_namespace(&namespace, db_config.clone(), RestoreOption::Latest) + .await?; + + let entry = self + .inner + .store + .get_with(namespace.clone(), async { Default::default() }) + .await; + let mut lock = entry.write().await; + + let ns_path: Arc = match lock.as_ref() { + Some(ns) => ns.path().clone(), + None => { + return Err(Error::NamespaceDoesntExist(namespace.to_string())); + } + }; + + // (1) Checkpoint before we tear down in-memory state so the data + // file has everything the WAL was holding. + if let Some(ns) = lock.as_ref() { + if let Err(e) = ns.checkpoint().await { + tracing::warn!("reset_replication: checkpoint failed: {e}; proceeding anyway"); + } + } + + // (2) Tear down in-memory namespace. Does not touch files. + if let Some(ns) = lock.take() { + if let Err(e) = ns.destroy().await { + // Best-effort: if we can't destroy cleanly, the next + // make_namespace will fail too, so surface the error now. + return Err(Error::Internal(format!( + "reset_replication: destroy failed: {e}" + ))); + } + } + + // (3) Remove only replication artifacts. Keep `data` and + // `data-wal`/`data-shm` + config. + for artifact in ["wallog"] { + let p = ns_path.join(artifact); + if let Err(e) = tokio::fs::remove_file(&p).await { + if e.kind() != std::io::ErrorKind::NotFound { + tracing::warn!( + "reset_replication: remove_file {} failed: {e}", + p.display() + ); + } + } + } + for artifact in ["snapshots", "to_compact"] { + let p = ns_path.join(artifact); + if let Err(e) = tokio::fs::remove_dir_all(&p).await { + if e.kind() != std::io::ErrorKind::NotFound { + tracing::warn!( + "reset_replication: remove_dir_all {} failed: {e}", + p.display() + ); + } + } + } + + // (4) Drop any stale .sentinel left by prior shutdown, then mint a + // fresh one so ReplicationLogger::open takes the dirty-recovery + // path on the next init. + let sentinel = ns_path.join(".sentinel"); + let _ = tokio::fs::remove_file(&sentinel).await; + // Ensure parent dir exists (it should, because data still lives + // there, but be defensive). + if !ns_path.exists() { + tokio::fs::create_dir_all(&ns_path).await.map_err(|e| { + Error::Internal(format!( + "reset_replication: create_dir_all {} failed: {e}", + ns_path.display() + )) + })?; + } + tokio::fs::File::create(&sentinel).await.map_err(|e| { + Error::Internal(format!( + "reset_replication: create .sentinel failed: {e}" + )) + })?; + + // (5) Re-initialize the namespace. setup() sees .sentinel → opens + // ReplicationLogger with dirty=true → recover() rebuilds the + // wallog page-by-page from the intact `data` file. + let ns = self + .make_namespace(&namespace, db_config, RestoreOption::Latest) + .await?; + lock.replace(ns); + + tracing::info!("reset_replication: rebuilt replication log for namespace {namespace}"); + Ok(()) + } + // This is only called on replica fn make_reset_cb(&self) -> ResetCb { let this = self.clone(); From e157523f194c95a5ddf5e2480265218b97b85a65 Mon Sep 17 00:00:00 2001 From: Si Le Date: Wed, 22 Apr 2026 02:04:00 -0400 Subject: [PATCH 02/10] reset-replication: refuse Mode B (live data-file corruption) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If the pre-teardown checkpoint fails with a DatabaseCorrupt / malformed error, the live data file itself is corrupt. 
Rebuilding the wallog from corrupt data would just propagate the corruption AND leave the namespace in a broken state (the destroy-then-make sequence fails halfway, leaving NamespaceDoesntExist). Now the endpoint returns 500 with an explicit error message pointing the operator to a restore-from-backup path, without destroying the in-memory namespace first. The namespace stays loaded and returns the underlying corruption error to subsequent reads — a true observability signal. Verified with /tmp/test_mode_b.sh: before fix, namespace went to 404 after reset; after fix, namespace stays loaded with 'malformed database schema' error. Mode A happy path (wallog corruption, live data OK) unchanged: 1135ms p95 over 3 reps, 100% data preserved. --- libsql-server/src/namespace/store.rs | 29 ++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/libsql-server/src/namespace/store.rs b/libsql-server/src/namespace/store.rs index 6da6766189..6221230796 100644 --- a/libsql-server/src/namespace/store.rs +++ b/libsql-server/src/namespace/store.rs @@ -256,9 +256,34 @@ impl NamespaceStore { // (1) Checkpoint before we tear down in-memory state so the data // file has everything the WAL was holding. + // + // If checkpoint fails with a `DatabaseCorrupt` / `malformed` + // error, the live data file itself is corrupt. Rebuilding the + // wallog from a corrupt data file would just propagate the + // corruption — so bail out BEFORE we destroy the in-memory + // namespace. The caller should fall back to a restore-from- + // backup path (Mode B), not this endpoint. if let Some(ns) = lock.as_ref() { - if let Err(e) = ns.checkpoint().await { - tracing::warn!("reset_replication: checkpoint failed: {e}; proceeding anyway"); + match ns.checkpoint().await { + Ok(()) => {} + Err(e) => { + let msg = e.to_string(); + let is_live_db_corrupt = msg.contains("malformed") + || msg.contains("DatabaseCorrupt") + || msg.contains("database disk image") + || msg.contains("file is not a database"); + if is_live_db_corrupt { + return Err(Error::Internal(format!( + "reset_replication: live data file appears corrupt \ + (checkpoint failed: {e}); refusing to rebuild \ + replication log from corrupt data. Use a \ + restore-from-backup procedure instead." + ))); + } + tracing::warn!( + "reset_replication: checkpoint failed: {e}; proceeding anyway" + ); + } } } From 1de5a7843de4ea7bf0321629f5d2bfa87b116917 Mon Sep 17 00:00:00 2001 From: Si Le Date: Wed, 22 Apr 2026 02:21:22 -0400 Subject: [PATCH 03/10] Add POST /v1/namespaces/:ns/integrity-check admin endpoint Runs PRAGMA quick_check (or integrity_check if {full:true}) on a namespace's live data file. Lets callers (cloud-sync-streamer, operators) classify the failure mode BEFORE attempting recovery: ok:true -> live DB is fine; corruption must be in wallog/snapshots (Mode A) -> use reset-replication (~1s recovery). ok:false -> live DB itself is corrupt (Mode B) -> reset-replication would propagate the corruption; fall back to delete + bulk-import. Implementation detail: SQLite surfaces severe corruption as prepare/connect errors rather than PRAGMA rows. The endpoint normalizes those into the same {ok:false, message:...} response shape so the caller gets a uniform classification signal (HTTP 200) rather than a server error (HTTP 500). 
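For illustration, a classification round-trip looks roughly like this
($ADMIN is the admin listener host:port and $NS the namespace name,
both placeholders; the Mode B message is one example of what SQLite
may report):

  $ curl -s -X POST -H 'Content-Type: application/json' -d '{}' \
      "http://$ADMIN/v1/namespaces/$NS/integrity-check"

  # Mode A (live DB fine; corruption, if any, is in wallog/snapshots):
  {"ok":true,"message":"ok","check":"quick"}

  # Mode B (live DB itself corrupt; do not use reset-replication):
  {"ok":false,"message":"database disk image is malformed","check":"quick"}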
- libsql-server/src/namespace/mod.rs: Namespace::integrity_check() - libsql-server/src/namespace/store.rs: NamespaceStore::integrity_check() - libsql-server/src/http/admin/mod.rs: handler + route Verified with /tmp/test_integrity_check.sh against a healthy namespace (ok), a namespace with a poisoned data file (Mode B detected), and a non-existent namespace (proper 404). --- libsql-server/src/http/admin/mod.rs | 51 ++++++++++++++++++++++++++++ libsql-server/src/namespace/mod.rs | 42 +++++++++++++++++++++++ libsql-server/src/namespace/store.rs | 38 +++++++++++++++++++++ 3 files changed, 131 insertions(+) diff --git a/libsql-server/src/http/admin/mod.rs b/libsql-server/src/http/admin/mod.rs index 6b3b5d9931..6784064018 100644 --- a/libsql-server/src/http/admin/mod.rs +++ b/libsql-server/src/http/admin/mod.rs @@ -162,6 +162,10 @@ where "/v1/namespaces/:namespace/reset-replication", post(handle_reset_replication), ) + .route( + "/v1/namespaces/:namespace/integrity-check", + post(handle_integrity_check), + ) .route("/v1/namespaces/:namespace", delete(handle_delete_namespace)) .route("/v1/namespaces/:namespace/stats", get(stats::handle_stats)) .route( @@ -576,6 +580,53 @@ async fn handle_reset_replication( Ok(()) } +#[derive(serde::Deserialize, Default)] +struct IntegrityCheckReq { + /// If true, run full `PRAGMA integrity_check` (O(DB size), thorough). + /// Default is `PRAGMA quick_check` which is fast and catches the + /// critical corruption classes. + #[serde(default)] + full: bool, +} + +#[derive(serde::Serialize)] +struct IntegrityCheckResp { + ok: bool, + /// Raw SQLite diagnostic text. `"ok"` on success, otherwise one or + /// more messages describing integrity issues. + message: String, + /// "quick" or "full", mirrors the `full` request field. + check: &'static str, +} + +/// Run `PRAGMA quick_check` (default) or `PRAGMA integrity_check` on a +/// namespace's live data file without touching other namespaces. +/// +/// Use this to classify the failure mode before recovery: +/// - `ok` → live DB is fine, any corruption is in wallog/snapshots (Mode A) +/// → caller should use `POST /v1/namespaces/:ns/reset-replication`. +/// - non-"ok" → live DB itself is corrupt (Mode B) +/// → caller should restore from backup, not reset-replication. +/// +/// Cheap: ~10ms for quick_check on small-to-medium namespaces. +async fn handle_integrity_check( + State(app_state): State>>, + Path(namespace): Path, + payload: Option>, +) -> crate::Result> { + let full = payload.map(|p| p.0.full).unwrap_or(false); + let message = app_state + .namespaces + .integrity_check(namespace, full) + .await?; + let ok = message.trim() == "ok"; + Ok(Json(IntegrityCheckResp { + ok, + message, + check: if full { "full" } else { "quick" }, + })) +} + #[derive(serde::Deserialize)] struct EnableHeapProfileRequest { #[serde(default)] diff --git a/libsql-server/src/namespace/mod.rs b/libsql-server/src/namespace/mod.rs index 9a49d30497..0073ee1057 100644 --- a/libsql-server/src/namespace/mod.rs +++ b/libsql-server/src/namespace/mod.rs @@ -91,6 +91,48 @@ impl Namespace { Ok(()) } + /// Run `PRAGMA quick_check` (or `integrity_check` if `full=true`) on + /// the namespace's live DB file and return the result string. + /// + /// For a healthy DB this returns `"ok"`. Anything else is an + /// integrity diagnostic message from SQLite. + /// + /// A catastrophically corrupt DB can fail before PRAGMA runs (e.g. + /// `malformed database schema` raised while the prepared statement + /// is parsing the schema). 
We normalize that into the same + /// `Ok(String)` return path so callers get a uniform classification + /// signal instead of a server error. + async fn integrity_check(&self, full: bool) -> anyhow::Result { + // Even creating a connection can fail ("malformed database schema") + // when the DB is badly corrupt — that IS an integrity signal so we + // surface it as `Ok(String)` rather than an Err that becomes a 500. + let conn = match self.db.connection_maker().create().await { + Ok(c) => c, + Err(e) => { + return Ok(format!("connection failed: {e}")); + } + }; + let pragma = if full { "integrity_check" } else { "quick_check" }; + let result = conn.with_raw(move |raw| -> rusqlite::Result> { + let mut stmt = raw.prepare(&format!("PRAGMA {pragma}"))?; + let mut rows = stmt.query([])?; + let mut out = Vec::new(); + while let Some(row) = rows.next()? { + let s: String = row.get(0)?; + out.push(s); + } + Ok(out) + }); + match result { + Ok(rows) => Ok(rows.join("\n")), + Err(e) => { + // SQLite surfaces integrity failures as prepare/query errors + // rather than PRAGMA rows. Treat those as integrity signals. + Ok(format!("{e}")) + } + } + } + async fn shutdown(mut self, should_checkpoint: bool) -> anyhow::Result<()> { self.tasks.shutdown().await; if should_checkpoint { diff --git a/libsql-server/src/namespace/store.rs b/libsql-server/src/namespace/store.rs index 6221230796..7fa179e478 100644 --- a/libsql-server/src/namespace/store.rs +++ b/libsql-server/src/namespace/store.rs @@ -153,6 +153,44 @@ impl NamespaceStore { Ok(()) } + /// Run `PRAGMA quick_check` (or `integrity_check` if `full=true`) on + /// the namespace's live DB and return the result string. Takes only + /// a read lock on the namespace; other operations (including other + /// namespaces on this pod) are unaffected. + /// + /// A healthy DB returns `"ok"`. Any other text is diagnostic output + /// from SQLite. Use this to classify Mode A (wallog/snapshot + /// corruption, live DB OK) vs Mode B (live DB corruption) before + /// deciding on a recovery procedure. + pub async fn integrity_check( + &self, + namespace: NamespaceName, + full: bool, + ) -> crate::Result { + if !self.inner.metadata.exists(&namespace).await { + return Err(Error::NamespaceDoesntExist(namespace.to_string())); + } + // Force-load the namespace so we can run a query against it. + let db_config = self.inner.metadata.handle(namespace.clone()).await; + let _ = self + .load_namespace(&namespace, db_config, RestoreOption::Latest) + .await?; + + let entry = self + .inner + .store + .get_with(namespace.clone(), async { Default::default() }) + .await; + let lock = entry.read().await; + if let Some(ns) = &*lock { + ns.integrity_check(full) + .await + .map_err(|e| Error::Internal(format!("integrity_check: {e}"))) + } else { + Err(Error::NamespaceDoesntExist(namespace.to_string())) + } + } + pub async fn reset( &self, namespace: NamespaceName, From 113964f6b06d64555eac3cf46ac6ad803e950ab9 Mon Sep 17 00:00:00 2001 From: Si Le Date: Wed, 22 Apr 2026 02:48:51 -0400 Subject: [PATCH 04/10] Add integration tests for reset-replication + integrity-check Three turmoil-based integration tests: - integrity_check_on_healthy_namespace: asserts POST /v1/namespaces/ :ns/integrity-check returns {ok:true, message:'ok', check:'quick'} for a healthy DB, and works with {full:true} as well. - reset_replication_preserves_data_on_healthy_namespace: creates a namespace, seeds 100 rows, calls reset-replication, confirms data is still there + writes still work + integrity-check reports ok. 
This is the idempotency guarantee for the endpoint. - reset_replication_on_nonexistent_namespace_returns_404: verifies the endpoint returns 404 (not 500) when the namespace doesn't exist. All 3 pass. Complements the ad-hoc /tmp/test_* bash scripts used during development. --- libsql-server/tests/namespaces/mod.rs | 165 ++++++++++++++++++++++++++ 1 file changed, 165 insertions(+) diff --git a/libsql-server/tests/namespaces/mod.rs b/libsql-server/tests/namespaces/mod.rs index 37b373b76e..631ec6d3a3 100644 --- a/libsql-server/tests/namespaces/mod.rs +++ b/libsql-server/tests/namespaces/mod.rs @@ -137,3 +137,168 @@ fn delete_namespace() { sim.run().unwrap(); } + +#[test] +fn integrity_check_on_healthy_namespace() { + let mut sim = Builder::new() + .simulation_duration(Duration::from_secs(1000)) + .build(); + let tmp = tempdir().unwrap(); + make_primary(&mut sim, tmp.path().to_path_buf()); + + sim.client("client", async { + let client = Client::new(); + client + .post("http://primary:9090/v1/namespaces/chk/create", json!({})) + .await?; + + let db = Database::open_remote_with_connector( + "http://chk.primary:8080", + "", + TurmoilConnector, + )?; + let conn = db.connect()?; + conn.execute("create table t(v text)", ()).await?; + conn.execute("insert into t values ('alive')", ()).await?; + + // quick_check should report ok on a healthy DB. + let resp = client + .post( + "http://primary:9090/v1/namespaces/chk/integrity-check", + json!({ "full": false }), + ) + .await?; + assert_eq!(resp.status(), hyper::http::StatusCode::OK); + let v = resp.json_value().await?; + assert_eq!(v["ok"], json!(true)); + assert_eq!(v["message"], json!("ok")); + assert_eq!(v["check"], json!("quick")); + + // Full integrity_check should also succeed. + let resp = client + .post( + "http://primary:9090/v1/namespaces/chk/integrity-check", + json!({ "full": true }), + ) + .await?; + let v = resp.json_value().await?; + assert_eq!(v["ok"], json!(true)); + assert_eq!(v["check"], json!("full")); + + Ok(()) + }); + + sim.run().unwrap(); +} + +#[test] +fn reset_replication_preserves_data_on_healthy_namespace() { + let mut sim = Builder::new() + .simulation_duration(Duration::from_secs(1000)) + .build(); + let tmp = tempdir().unwrap(); + make_primary(&mut sim, tmp.path().to_path_buf()); + + sim.client("client", async { + let client = Client::new(); + client + .post("http://primary:9090/v1/namespaces/reset/create", json!({})) + .await?; + + let db = Database::open_remote_with_connector( + "http://reset.primary:8080", + "", + TurmoilConnector, + )?; + let conn = db.connect()?; + conn.execute("create table t(v text)", ()).await?; + for i in 0..100 { + conn.execute( + &format!("insert into t values ('row-{i}')"), + (), + ) + .await?; + } + + // Before reset: 100 rows. + let mut rows = conn.query("select count(*) from t", ()).await?; + assert!(matches!( + rows.next().await.unwrap().unwrap().get_value(0)?, + Value::Integer(100) + )); + + // Reset the replication log on a healthy namespace. + let resp = client + .post( + "http://primary:9090/v1/namespaces/reset/reset-replication", + json!({}), + ) + .await?; + assert_eq!(resp.status(), hyper::http::StatusCode::OK); + + // After reset: still 100 rows (data preserved). 
+ let db2 = Database::open_remote_with_connector( + "http://reset.primary:8080", + "", + TurmoilConnector, + )?; + let conn2 = db2.connect()?; + let mut rows = conn2.query("select count(*) from t", ()).await?; + assert!(matches!( + rows.next().await.unwrap().unwrap().get_value(0)?, + Value::Integer(100) + )); + + // And writes still work. + conn2 + .execute("insert into t values ('post-reset')", ()) + .await?; + let mut rows = conn2.query("select count(*) from t", ()).await?; + assert!(matches!( + rows.next().await.unwrap().unwrap().get_value(0)?, + Value::Integer(101) + )); + + // Integrity check confirms the new DB is fine. + let resp = client + .post( + "http://primary:9090/v1/namespaces/reset/integrity-check", + json!({}), + ) + .await?; + let v = resp.json_value().await?; + assert_eq!(v["ok"], json!(true)); + + Ok(()) + }); + + sim.run().unwrap(); +} + +#[test] +fn reset_replication_on_nonexistent_namespace_returns_404() { + let mut sim = Builder::new() + .simulation_duration(Duration::from_secs(1000)) + .build(); + let tmp = tempdir().unwrap(); + make_primary(&mut sim, tmp.path().to_path_buf()); + + sim.client("client", async { + let client = Client::new(); + let resp = client + .post( + "http://primary:9090/v1/namespaces/missing/reset-replication", + json!({}), + ) + .await; + // Server-error path: post_with_headers bails on 5xx, but 404 + // should come through cleanly as an error response. + match resp { + Ok(r) => assert_eq!(r.status(), hyper::http::StatusCode::NOT_FOUND), + Err(e) => panic!("expected 404 response, got error: {e}"), + } + Ok(()) + }); + + sim.run().unwrap(); +} From 486d78f451244f36f464a9f9b504255fad73c97f Mon Sep 17 00:00:00 2001 From: Si Le Date: Wed, 22 Apr 2026 03:49:57 -0400 Subject: [PATCH 05/10] Observability: start + elapsed-ms logs on reset_replication Paired 'reset_replication: starting for namespace X' and 'rebuilt replication log for namespace X in Yms' log lines. Lets ops correlate recovery events to client-side metrics and measure tail latency in prod. No performance impact (info! tracing is near-zero cost). --- libsql-server/src/namespace/store.rs | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/libsql-server/src/namespace/store.rs b/libsql-server/src/namespace/store.rs index 7fa179e478..4909f69aed 100644 --- a/libsql-server/src/namespace/store.rs +++ b/libsql-server/src/namespace/store.rs @@ -271,6 +271,9 @@ impl NamespaceStore { return Err(Error::NamespaceDoesntExist(namespace.to_string())); } + let start = Instant::now(); + tracing::info!("reset_replication: starting for namespace {namespace}"); + // Load the namespace first so we can resolve its on-disk path // cleanly. This is effectively a no-op if it's already hot. 
let db_config = self.inner.metadata.handle(namespace.clone()).await; @@ -390,7 +393,10 @@ impl NamespaceStore { .await?; lock.replace(ns); - tracing::info!("reset_replication: rebuilt replication log for namespace {namespace}"); + let elapsed_ms = start.elapsed().as_millis(); + tracing::info!( + "reset_replication: rebuilt replication log for namespace {namespace} in {elapsed_ms}ms" + ); Ok(()) } From 7d4d26e107e5cc079e03b8bca9c6f786c8888867 Mon Sep 17 00:00:00 2001 From: Si Le Date: Wed, 22 Apr 2026 05:11:39 -0400 Subject: [PATCH 06/10] Gate .sentinel removal on graceful shutdown behind env flag MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Default behavior unchanged: .sentinel is removed on graceful shutdown (preserves existing semantics for the 99% of deployments that don't need the kubectl-delete-pod recovery path). With LIBSQL_PRESERVE_SENTINEL_ON_SHUTDOWN=1 set, the sentinel survives graceful shutdown. This re-enables the documented operator recovery procedure: 1. kubectl exec -- touch /data/dbs//.sentinel 2. kubectl delete pod # SIGTERM → graceful shutdown 3. Kubernetes recreates pod 4. Next namespace access triggers dirty-recovery on the preserved .sentinel, rebuilding wallog/snapshots from the live data file Without this flag, step 2's graceful shutdown removes the sentinel BEFORE the pod stops, so step 4 doesn't find a sentinel and skips the dirty-recovery path. Now that POST /v1/namespaces/:ns/reset-replication is the primary recovery primitive, this flag is a low-priority belt-and-suspenders for emergency ops workflows (e.g. when the admin API is unavailable). Verified end-to-end with /tmp/run_sentinel_preserve_simple.sh: sentinel preserved with flag, dirty-recovery fires on next access, data preserved through the cycle. --- libsql-server/src/namespace/mod.rs | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/libsql-server/src/namespace/mod.rs b/libsql-server/src/namespace/mod.rs index 0073ee1057..76eaa63ba0 100644 --- a/libsql-server/src/namespace/mod.rs +++ b/libsql-server/src/namespace/mod.rs @@ -139,8 +139,31 @@ impl Namespace { self.checkpoint().await?; } self.db.shutdown().await?; - if let Err(e) = tokio::fs::remove_file(self.path.join(".sentinel")).await { - tracing::error!("unable to remove .sentinel file: {}", e); + // Historically `.sentinel` was removed unconditionally on graceful + // shutdown. This makes the documented `touch .sentinel + kubectl + // delete pod` operator recovery path silently ineffective, because + // kubectl sends SIGTERM first which invokes this graceful shutdown + // and removes the sentinel before the pod actually stops. + // + // Guard the removal behind `LIBSQL_PRESERVE_SENTINEL_ON_SHUTDOWN`. + // When set, the sentinel survives graceful shutdown, so the next + // namespace init will correctly trigger dirty-recovery from the + // live `data` file. + // + // Default remains: remove (preserves existing behavior for the + // 99% of deployments that don't need this recovery path, now that + // `POST /v1/namespaces/:ns/reset-replication` is the primary + // recovery primitive). 
+ let preserve_sentinel = + std::env::var("LIBSQL_PRESERVE_SENTINEL_ON_SHUTDOWN").is_ok(); + if !preserve_sentinel { + if let Err(e) = tokio::fs::remove_file(self.path.join(".sentinel")).await { + tracing::error!("unable to remove .sentinel file: {}", e); + } + } else { + tracing::info!( + "LIBSQL_PRESERVE_SENTINEL_ON_SHUTDOWN set; keeping .sentinel for recovery" + ); } Ok(()) } From b157328f3f8575c951e210e4b36607534f5a9233 Mon Sep 17 00:00:00 2001 From: Si Le Date: Wed, 22 Apr 2026 06:07:53 -0400 Subject: [PATCH 07/10] Add turmoil test: reset_replication is idempotent Verifies that calling POST /v1/namespaces/:ns/reset-replication three times in a row succeeds at each attempt and preserves data. This is a critical safety property: the streamer's retry-after-reset path (and any operator running the command twice) must not corrupt the data file through sequential invocations. Previously validated empirically via bench_recovery.sh (run 21 in autoresearch.jsonl: 12ms per call on healthy ns, no data loss) but was not pinned as a turmoil integration test. Now it's in CI. All 4 reset-replication / integrity-check turmoil tests pass (the 2 pre-existing meta_attach flakes reproduce without this change). --- libsql-server/tests/namespaces/mod.rs | 73 +++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) diff --git a/libsql-server/tests/namespaces/mod.rs b/libsql-server/tests/namespaces/mod.rs index 631ec6d3a3..192dfff442 100644 --- a/libsql-server/tests/namespaces/mod.rs +++ b/libsql-server/tests/namespaces/mod.rs @@ -302,3 +302,76 @@ fn reset_replication_on_nonexistent_namespace_returns_404() { sim.run().unwrap(); } + +#[test] +fn reset_replication_is_idempotent() { + // An operator (or the streamer's retry-after-reset path) may call + // reset-replication multiple times in quick succession. Each call + // must succeed independently without corrupting the data file. + let mut sim = Builder::new() + .simulation_duration(Duration::from_secs(1000)) + .build(); + let tmp = tempdir().unwrap(); + make_primary(&mut sim, tmp.path().to_path_buf()); + + sim.client("client", async { + let client = Client::new(); + client + .post("http://primary:9090/v1/namespaces/idem/create", json!({})) + .await?; + + let db = Database::open_remote_with_connector( + "http://idem.primary:8080", + "", + TurmoilConnector, + )?; + let conn = db.connect()?; + conn.execute("create table t(v text)", ()).await?; + for i in 0..20 { + conn.execute(&format!("insert into t values ('row-{i}')"), ()) + .await?; + } + + // Call reset-replication three times in a row. Each must 200. + for attempt in 0..3 { + let resp = client + .post( + "http://primary:9090/v1/namespaces/idem/reset-replication", + json!({}), + ) + .await?; + assert_eq!( + resp.status(), + hyper::http::StatusCode::OK, + "attempt {attempt} should return 200" + ); + } + + // After three resets, the 20 rows are still there. + let db2 = Database::open_remote_with_connector( + "http://idem.primary:8080", + "", + TurmoilConnector, + )?; + let conn2 = db2.connect()?; + let mut rows = conn2.query("select count(*) from t", ()).await?; + assert!(matches!( + rows.next().await.unwrap().unwrap().get_value(0)?, + Value::Integer(20) + )); + + // And integrity-check still passes. 
+ let resp = client + .post( + "http://primary:9090/v1/namespaces/idem/integrity-check", + json!({}), + ) + .await?; + let v = resp.json_value().await?; + assert_eq!(v["ok"], json!(true)); + + Ok(()) + }); + + sim.run().unwrap(); +} From 2ffd4b3e4213c8c68731b0bd0831f4d0823d1154 Mon Sep 17 00:00:00 2001 From: Si Le Date: Wed, 22 Apr 2026 06:09:27 -0400 Subject: [PATCH 08/10] Add turmoil test: integrity_check defaults full=false when field omitted Protects the documented API contract: POST /v1/namespaces/:ns/integrity-check with an empty body {} must default to quick_check. This lets older or simpler clients omit the field entirely. Complements the existing integrity_check_on_healthy_namespace test which only covers explicit full:true/full:false. 5 reset-replication + integrity-check turmoil tests all pass. --- libsql-server/tests/namespaces/mod.rs | 44 +++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/libsql-server/tests/namespaces/mod.rs b/libsql-server/tests/namespaces/mod.rs index 192dfff442..ba68b4446f 100644 --- a/libsql-server/tests/namespaces/mod.rs +++ b/libsql-server/tests/namespaces/mod.rs @@ -303,6 +303,50 @@ fn reset_replication_on_nonexistent_namespace_returns_404() { sim.run().unwrap(); } +#[test] +fn integrity_check_defaults_to_quick_when_full_omitted() { + // Verifies backward compatibility: if the request body is {} (no + // `full` field), the server defaults to quick_check. This preserves + // the contract for any older/simpler client that doesn't send the + // field, and is also the documented default in the admin API. + let mut sim = Builder::new() + .simulation_duration(Duration::from_secs(1000)) + .build(); + let tmp = tempdir().unwrap(); + make_primary(&mut sim, tmp.path().to_path_buf()); + + sim.client("client", async { + let client = Client::new(); + client + .post("http://primary:9090/v1/namespaces/defchk/create", json!({})) + .await?; + + let db = Database::open_remote_with_connector( + "http://defchk.primary:8080", + "", + TurmoilConnector, + )?; + let conn = db.connect()?; + conn.execute("create table t(v text)", ()).await?; + + // Empty body: no `full` field at all. + let resp = client + .post( + "http://primary:9090/v1/namespaces/defchk/integrity-check", + json!({}), + ) + .await?; + assert_eq!(resp.status(), hyper::http::StatusCode::OK); + let v = resp.json_value().await?; + assert_eq!(v["ok"], json!(true)); + assert_eq!(v["check"], json!("quick")); + + Ok(()) + }); + + sim.run().unwrap(); +} + #[test] fn reset_replication_is_idempotent() { // An operator (or the streamer's retry-after-reset path) may call From 1e7f7adbf696af43b98ff4d2b2c49f8332252835 Mon Sep 17 00:00:00 2001 From: Si Le Date: Wed, 22 Apr 2026 06:19:36 -0400 Subject: [PATCH 09/10] reset-replication: return {"elapsed_ms": u64} JSON in response body Before: handler returned 200 with empty body. Operators had to grep libsql logs to see how long the reset took. After: handler returns 200 with {"elapsed_ms": N}. The streamer (and any other caller) can parse this into a StatsD histogram for dashboard latency tracking, without server-side log scraping. Backwards compatible: older clients that ignore the body still work. New clients that expect the body handle an empty body gracefully. New turmoil test: reset_replication_response_includes_elapsed_ms verifies the field is present and within sanity bounds. 
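Illustrative round-trip ($ADMIN is the admin listener host:port and
$NS the namespace name, both placeholders; the timing shown matches
the ~1.1s p95 measured earlier in this series but will vary):

  $ curl -s -X POST "http://$ADMIN/v1/namespaces/$NS/reset-replication"
  {"elapsed_ms":1135}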
--- libsql-server/src/http/admin/mod.rs | 12 +++++-- libsql-server/src/namespace/store.rs | 6 ++-- libsql-server/tests/namespaces/mod.rs | 48 +++++++++++++++++++++++++++ 3 files changed, 60 insertions(+), 6 deletions(-) diff --git a/libsql-server/src/http/admin/mod.rs b/libsql-server/src/http/admin/mod.rs index 6784064018..0f97efc0c3 100644 --- a/libsql-server/src/http/admin/mod.rs +++ b/libsql-server/src/http/admin/mod.rs @@ -572,12 +572,18 @@ async fn handle_checkpoint( /// - metastore config (jwt_key, block_writes, etc.) is preserved /// /// Other namespaces on this pod are completely unaffected. +#[derive(serde::Serialize)] +struct ResetReplicationResp { + /// Wall-clock duration of the reset, for operator-visible metrics. + elapsed_ms: u64, +} + async fn handle_reset_replication( State(app_state): State>>, Path(namespace): Path, -) -> crate::Result<()> { - app_state.namespaces.reset_replication(namespace).await?; - Ok(()) +) -> crate::Result> { + let elapsed_ms = app_state.namespaces.reset_replication(namespace).await?; + Ok(axum::Json(ResetReplicationResp { elapsed_ms })) } #[derive(serde::Deserialize, Default)] diff --git a/libsql-server/src/namespace/store.rs b/libsql-server/src/namespace/store.rs index 4909f69aed..55cec58011 100644 --- a/libsql-server/src/namespace/store.rs +++ b/libsql-server/src/namespace/store.rs @@ -262,7 +262,7 @@ impl NamespaceStore { /// Brief unavailability window: from the moment we take the write lock /// until `make_namespace` returns. Other namespaces on the pod are /// completely unaffected. - pub async fn reset_replication(&self, namespace: NamespaceName) -> crate::Result<()> { + pub async fn reset_replication(&self, namespace: NamespaceName) -> crate::Result { if self.inner.has_shutdown.load(Ordering::Relaxed) { return Err(Error::NamespaceStoreShutdown); } @@ -393,11 +393,11 @@ impl NamespaceStore { .await?; lock.replace(ns); - let elapsed_ms = start.elapsed().as_millis(); + let elapsed_ms = start.elapsed().as_millis() as u64; tracing::info!( "reset_replication: rebuilt replication log for namespace {namespace} in {elapsed_ms}ms" ); - Ok(()) + Ok(elapsed_ms) } // This is only called on replica diff --git a/libsql-server/tests/namespaces/mod.rs b/libsql-server/tests/namespaces/mod.rs index ba68b4446f..fbf60dd111 100644 --- a/libsql-server/tests/namespaces/mod.rs +++ b/libsql-server/tests/namespaces/mod.rs @@ -347,6 +347,54 @@ fn integrity_check_defaults_to_quick_when_full_omitted() { sim.run().unwrap(); } +#[test] +fn reset_replication_response_includes_elapsed_ms() { + // Operators (and ops dashboards) want to see how long reset took + // without having to grep logs. The JSON response body includes + // elapsed_ms, which must be a positive integer for any successful + // reset. 
+ let mut sim = Builder::new() + .simulation_duration(Duration::from_secs(1000)) + .build(); + let tmp = tempdir().unwrap(); + make_primary(&mut sim, tmp.path().to_path_buf()); + + sim.client("client", async { + let client = Client::new(); + client + .post("http://primary:9090/v1/namespaces/elapsed/create", json!({})) + .await?; + + let db = Database::open_remote_with_connector( + "http://elapsed.primary:8080", + "", + TurmoilConnector, + )?; + let conn = db.connect()?; + conn.execute("create table t(v text)", ()).await?; + conn.execute("insert into t values ('x')", ()).await?; + + let resp = client + .post( + "http://primary:9090/v1/namespaces/elapsed/reset-replication", + json!({}), + ) + .await?; + assert_eq!(resp.status(), hyper::http::StatusCode::OK); + let v = resp.json_value().await?; + let elapsed = v["elapsed_ms"].as_u64().expect("elapsed_ms must be u64"); + // We can't assert a specific value (depends on hardware) but we + // can assert it's present and non-negative. In practice this is + // typically single-digit milliseconds for an empty/small table. + // Max bound guards against a runaway regression. + assert!(elapsed < 30_000, "elapsed_ms={elapsed} exceeds 30s sanity bound"); + + Ok(()) + }); + + sim.run().unwrap(); +} + #[test] fn reset_replication_is_idempotent() { // An operator (or the streamer's retry-after-reset path) may call From d706429021614c71442dc92616d2289c20f723c6 Mon Sep 17 00:00:00 2001 From: Si Le Date: Wed, 22 Apr 2026 06:24:24 -0400 Subject: [PATCH 10/10] Add turmoil test: integrity-check on nonexistent namespace returns 404 Symmetric to reset_replication_on_nonexistent_namespace_returns_404. If error handling regresses (e.g., accidentally returning 500 on NamespaceDoesntExist), the streamer's classifier would treat that as a transient server error and retry, instead of falling through to the optimistic-reset path. This test pins the correct behavior. 7 libsql turmoil tests total, all pass. --- libsql-server/tests/namespaces/mod.rs | 31 +++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/libsql-server/tests/namespaces/mod.rs b/libsql-server/tests/namespaces/mod.rs index fbf60dd111..af98c164d7 100644 --- a/libsql-server/tests/namespaces/mod.rs +++ b/libsql-server/tests/namespaces/mod.rs @@ -303,6 +303,37 @@ fn reset_replication_on_nonexistent_namespace_returns_404() { sim.run().unwrap(); } +#[test] +fn integrity_check_on_nonexistent_namespace_returns_404() { + // Symmetric to reset_replication_on_nonexistent_namespace_returns_404. + // If someone regresses error handling (e.g., returning 500 on + // NamespaceDoesntExist), the streamer's classifier would treat that + // as a transient server error and retry, instead of falling through + // to the optimistic-reset path. + let mut sim = Builder::new() + .simulation_duration(Duration::from_secs(1000)) + .build(); + let tmp = tempdir().unwrap(); + make_primary(&mut sim, tmp.path().to_path_buf()); + + sim.client("client", async { + let client = Client::new(); + let resp = client + .post( + "http://primary:9090/v1/namespaces/missing/integrity-check", + json!({}), + ) + .await; + match resp { + Ok(r) => assert_eq!(r.status(), hyper::http::StatusCode::NOT_FOUND), + Err(e) => panic!("expected 404 response, got error: {e}"), + } + Ok(()) + }); + + sim.run().unwrap(); +} + #[test] fn integrity_check_defaults_to_quick_when_full_omitted() { // Verifies backward compatibility: if the request body is {} (no