From 4e6842b873e32d4045c22e640382cea9ee32eea7 Mon Sep 17 00:00:00 2001 From: Steven Hildreth Date: Wed, 10 Jun 2026 18:07:55 -0500 Subject: [PATCH 1/9] fix: remove row limit on paged row locator cache to improve indexed equality lookups --- crates/decentdb/src/exec/mod.rs | 5 +- ...ql_indexed_seek_checkpointed_perf_tests.rs | 282 ++++++++++++++++++ docs/about/changelog.md | 15 + 3 files changed, 299 insertions(+), 3 deletions(-) create mode 100644 crates/decentdb/tests/sql_indexed_seek_checkpointed_perf_tests.rs diff --git a/crates/decentdb/src/exec/mod.rs b/crates/decentdb/src/exec/mod.rs index 597ec4b..e4f34c7 100644 --- a/crates/decentdb/src/exec/mod.rs +++ b/crates/decentdb/src/exec/mod.rs @@ -94,7 +94,6 @@ const TABLE_PAYLOAD_MAGIC: &[u8; 8] = b"DDBTBL01"; const TABLE_PAGED_MANIFEST_MAGIC: &[u8; 8] = b"DDBTPG02"; const PAGED_TABLE_TARGET_CHUNK_PAGES: usize = 16; pub(super) const PAGED_TABLE_RESIDENT_APPEND_ROW_THRESHOLD: usize = 1024; -const DEFERRED_PAGED_ROW_LOCATOR_CACHE_MAX_ROWS: usize = 250_000; const GENERATED_COLUMNS_SECTION_MAGIC: &[u8; 8] = b"DDBGCM02"; const INDEX_INCLUDE_COLUMNS_SECTION_MAGIC: &[u8; 8] = b"DDBICL1\0"; const SCHEMAS_SECTION_MAGIC: &[u8; 8] = b"DDBSCH01"; @@ -2039,8 +2038,8 @@ impl EngineRuntime { table_name: &str, state: PersistedTableState, ) -> Result> { - let needs_locator_cache = state.row_count <= DEFERRED_PAGED_ROW_LOCATOR_CACHE_MAX_ROWS - && self.should_cache_deferred_paged_row_locators(table_name); + let needs_locator_cache = + self.should_cache_deferred_paged_row_locators(table_name); if !db.config().persistent_pk_index && !needs_locator_cache { self.deferred_paged_row_locator_caches_mut() .remove(table_name); diff --git a/crates/decentdb/tests/sql_indexed_seek_checkpointed_perf_tests.rs b/crates/decentdb/tests/sql_indexed_seek_checkpointed_perf_tests.rs new file mode 100644 index 0000000..b385e82 --- /dev/null +++ b/crates/decentdb/tests/sql_indexed_seek_checkpointed_perf_tests.rs @@ -0,0 +1,282 @@ +//! 
Regression tests for indexed seek performance on checkpointed/paged tables. +//! +//! These tests guard against the O(manifest_size) per-row lookup regression +//! that occurs when the paged row locator cache is not built for large tables. +//! The Melodee MusicBrainz validation observed ~3.5s for indexed equality on +//! a 2.9M row checkpointed Artist table, while the same query shape on a 493K +//! row ArtistAlias table took ~260ms. +//! +//! Root cause: the `DEFERRED_PAGED_ROW_LOCATOR_CACHE_MAX_ROWS` constant +//! (250,000) prevented the paged row locator cache from being built for tables +//! exceeding this limit. Without the cache, each row-by-id lookup from a +//! secondary index reads the entire paged table manifest from disk, making +//! indexed equality O(manifest_size) per row instead of O(1). +//! +//! Fix: remove the row count limit on the paged row locator cache. The cache +//! is now built for all tables that have btree indexes or row_id alias columns, +//! regardless of row count. The memory cost is ~40 bytes per row, which is +//! acceptable for tables that are actively queried through indexes. + +use decentdb::{Db, DbConfig, Value}; +use std::time::Instant; +use tempfile::TempDir; + +fn large_table_config() -> DbConfig { + DbConfig { + paged_row_storage: true, + defer_table_materialization: true, + ..DbConfig::default() + } +} + +/// Regression test for DDB-002/DDB-003: indexed equality on a large +/// checkpointed table must use the paged row locator cache for O(1) +/// row-by-id lookups, not fall through to O(manifest_size) linear scan. +/// +/// Creates a synthetic table with enough rows to exceed the old 250,000 +/// paged row locator cache limit, checkpoints and reopens the database, +/// then verifies that indexed equality queries return correct results +/// within a bounded time. 
+#[test] +fn indexed_equality_on_large_checkpointed_table_is_bounded() { + let temp_dir = TempDir::new().unwrap(); + let db_path = temp_dir.path().join("large.ddb"); + let path_str = db_path.to_str().unwrap(); + + let target_row_id: i64 = 280_001; + let target_name = format!("NameNormalized_{target_row_id}"); + let target_mbid = format!("mbid-{target_row_id:08x}-fade-4beef-beef-{target_row_id:012x}"); + + // Phase 1: Create table, insert data, checkpoint, and close + { + let db = Db::open_or_create(path_str, large_table_config()).unwrap(); + db.execute( + "CREATE TABLE Artist ( + Id INT64 PRIMARY KEY, + MusicBrainzArtistId INT64, + MusicBrainzIdRaw TEXT, + NameNormalized TEXT, + SortName TEXT, + AlternateNames TEXT + )", + ) + .unwrap(); + db.execute("CREATE INDEX IX_Artist_MusicBrainzIdRaw ON Artist(MusicBrainzIdRaw)") + .unwrap(); + db.execute("CREATE INDEX IX_Artist_NameNormalized ON Artist(NameNormalized)") + .unwrap(); + + // Insert 300,000 rows using transactions (exceeds old 250,000 limit) + let row_count: i64 = 300_000; + let mut txn = db.transaction().unwrap(); + let stmt = txn.prepare( + "INSERT INTO Artist VALUES ($1, $2, $3, $4, $5, $6)" + ).unwrap(); + for i in 1..=row_count { + stmt.execute_in( + &mut txn, + &[ + Value::Int64(i), + Value::Int64(i), + Value::Text(format!("mbid-{i:08x}-fade-4beef-beef-{i:012x}")), + Value::Text(format!("NameNormalized_{i}")), + Value::Text(format!("SortName_{i}")), + Value::Text(format!("Alt1_{i};Alt2_{i};Alt3_{i}")), + ], + ) + .unwrap(); + } + txn.commit().unwrap(); + + db.checkpoint().expect("checkpoint"); + // db is dropped here when exiting the block scope + } + + // Phase 2: Reopen and test indexed seek performance + let db = Db::open_or_create(path_str, large_table_config()).unwrap(); + + let explain = db + .execute(&format!( + "EXPLAIN SELECT * FROM Artist WHERE NameNormalized = '{target_name}' ORDER BY SortName LIMIT 10" + )) + .unwrap(); + let explain_lines: Vec = explain + .explain_lines() + .iter() + 
.map(|s| s.to_string()) + .collect(); + assert!( + explain_lines + .iter() + .any(|line| line.contains("IndexSeek") && line.contains("ix_artist_namenormalized")), + "expected IndexSeek on ix_artist_namenormalized, got: {explain_lines:?}" + ); + + let started = Instant::now(); + let result = db + .execute(&format!( + "SELECT * FROM Artist WHERE NameNormalized = '{target_name}' ORDER BY SortName LIMIT 10" + )) + .unwrap(); + let elapsed = started.elapsed(); + + assert_eq!(result.rows().len(), 1, "expected exactly 1 matching row"); + let row = &result.rows()[0]; + assert_eq!(row.values()[3], Value::Text(target_name.clone())); + + let started2 = Instant::now(); + let result2 = db + .execute(&format!( + "SELECT * FROM Artist WHERE MusicBrainzIdRaw = '{target_mbid}' ORDER BY Id LIMIT 1" + )) + .unwrap(); + let elapsed2 = started2.elapsed(); + + assert_eq!(result2.rows().len(), 1, "expected exactly 1 matching row"); + let row2 = &result2.rows()[0]; + assert_eq!(row2.values()[2], Value::Text(target_mbid.clone())); + + assert!( + elapsed.as_secs_f64() < 5.0, + "indexed NameNormalized equality on 300K-row checkpointed table took {elapsed:?}; \ + expected bounded paged-row-locator lookup, not O(manifest_size) scan" + ); + assert!( + elapsed2.as_secs_f64() < 5.0, + "indexed MusicBrainzIdRaw equality on 300K-row checkpointed table took {elapsed2:?}; \ + expected bounded paged-row-locator lookup, not O(manifest_size) scan" + ); + // TempDir is automatically cleaned up +} + +/// Comparator: indexed equality on a small checkpointed table should also +/// be fast. This proves the indexed path works correctly for both small +/// and large tables after checkpoint/reopen. 
+#[test] +fn indexed_equality_on_small_checkpointed_table_is_correct() { + let temp_dir = TempDir::new().unwrap(); + let db_path = temp_dir.path().join("small.ddb"); + let path_str = db_path.to_str().unwrap(); + + // Phase 1: Create, insert, checkpoint, close + { + let db = Db::open_or_create(path_str, large_table_config()).unwrap(); + db.execute( + "CREATE TABLE Item ( + Id INT64 PRIMARY KEY, + LookupKey TEXT, + Payload TEXT + )", + ) + .unwrap(); + db.execute("CREATE INDEX IX_Item_LookupKey ON Item(LookupKey)") + .unwrap(); + + let mut txn = db.transaction().unwrap(); + let stmt = txn.prepare("INSERT INTO Item VALUES ($1, $2, $3)").unwrap(); + for i in 1..=10_000 { + stmt.execute_in( + &mut txn, + &[ + Value::Int64(i), + Value::Text(format!("key_{i:06}")), + Value::Text(format!("payload_data_{i}")), + ], + ) + .unwrap(); + } + txn.commit().unwrap(); + db.checkpoint().expect("checkpoint"); + } + + // Phase 2: Reopen and test + let db = Db::open_or_create(path_str, large_table_config()).unwrap(); + + let result = db + .execute("SELECT * FROM Item WHERE LookupKey = 'key_009999' ORDER BY Id LIMIT 1") + .unwrap(); + assert_eq!(result.rows().len(), 1); + assert_eq!( + result.rows()[0].values()[1], + Value::Text("key_009999".to_string()) + ); + assert_eq!( + result.rows()[0].values()[2], + Value::Text("payload_data_9999".to_string()) + ); + + let result_miss = db + .execute("SELECT * FROM Item WHERE LookupKey = 'key_999999' ORDER BY Id LIMIT 1") + .unwrap(); + assert_eq!(result_miss.rows().len(), 0); + // TempDir is automatically cleaned up +} + +/// Verifies that EXPLAIN reports IndexSeek for the indexed equality shape +/// on a checkpointed table, proving the planner correctly selects the +/// index path even when the table is deferred/paged. 
+#[test] +fn explain_reports_index_seek_for_checkpointed_table() { + let temp_dir = TempDir::new().unwrap(); + let db_path = temp_dir.path().join("explain.ddb"); + let path_str = db_path.to_str().unwrap(); + + // Phase 1: Create, insert, checkpoint, close + { + let db = Db::open_or_create(path_str, large_table_config()).unwrap(); + db.execute( + "CREATE TABLE Probe ( + Id INT64 PRIMARY KEY, + Code TEXT, + Label TEXT + )", + ) + .unwrap(); + db.execute("CREATE INDEX IX_Probe_Code ON Probe(Code)") + .unwrap(); + + let mut txn = db.transaction().unwrap(); + let stmt = txn.prepare("INSERT INTO Probe VALUES ($1, $2, $3)").unwrap(); + for i in 1..=5_000 { + stmt.execute_in( + &mut txn, + &[ + Value::Int64(i), + Value::Text(format!("code_{i:06}")), + Value::Text(format!("label_{i}")), + ], + ) + .unwrap(); + } + txn.commit().unwrap(); + db.checkpoint().expect("checkpoint"); + } + + // Phase 2: Reopen and test + let db = Db::open_or_create(path_str, large_table_config()).unwrap(); + + let explain = db + .execute("EXPLAIN SELECT * FROM Probe WHERE Code = 'code_002500' ORDER BY Id LIMIT 1") + .unwrap(); + let lines: Vec = explain + .explain_lines() + .iter() + .map(|s| s.to_string()) + .collect(); + assert!( + lines + .iter() + .any(|line| line.contains("IndexSeek") && line.contains("ix_probe_code")), + "expected IndexSeek on ix_probe_code, got: {lines:?}" + ); + + let result = db + .execute("SELECT * FROM Probe WHERE Code = 'code_002500' ORDER BY Id LIMIT 1") + .unwrap(); + assert_eq!(result.rows().len(), 1); + assert_eq!( + result.rows()[0].values()[1], + Value::Text("code_002500".to_string()) + ); + // TempDir is automatically cleaned up +} diff --git a/docs/about/changelog.md b/docs/about/changelog.md index f6bebb3..cc6c203 100644 --- a/docs/about/changelog.md +++ b/docs/about/changelog.md @@ -7,6 +7,21 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Fixed + +- Removed the 250,000-row limit on the paged row locator 
cache that caused + indexed equality lookups on large checkpointed tables to degrade to + O(manifest_size) per-row linear scans. The cache is now built for all + tables with btree indexes or row_id alias columns, regardless of row count. + This fixes multi-second indexed lookups observed on tables with millions of + rows (Melodee DDB-002/DDB-003). + +### Added + +- Added regression tests for indexed seek performance on checkpointed/paged + tables with 300K+ rows, verifying bounded lookup time after checkpoint and + reopen. + ## [2.10.0] - [2026-06-10] From 415522a719aaa4a43a3a89e7079ef7f4d1a23aa6 Mon Sep 17 00:00:00 2001 From: Steven Hildreth Date: Thu, 11 Jun 2026 11:31:20 -0500 Subject: [PATCH 2/9] refactor(tests): improve statement preparation formatting for clarity --- crates/decentdb/src/exec/mod.rs | 3 +-- .../tests/sql_indexed_seek_checkpointed_perf_tests.rs | 10 ++++++---- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/crates/decentdb/src/exec/mod.rs b/crates/decentdb/src/exec/mod.rs index e4f34c7..9aafdbc 100644 --- a/crates/decentdb/src/exec/mod.rs +++ b/crates/decentdb/src/exec/mod.rs @@ -2038,8 +2038,7 @@ impl EngineRuntime { table_name: &str, state: PersistedTableState, ) -> Result> { - let needs_locator_cache = - self.should_cache_deferred_paged_row_locators(table_name); + let needs_locator_cache = self.should_cache_deferred_paged_row_locators(table_name); if !db.config().persistent_pk_index && !needs_locator_cache { self.deferred_paged_row_locator_caches_mut() .remove(table_name); diff --git a/crates/decentdb/tests/sql_indexed_seek_checkpointed_perf_tests.rs b/crates/decentdb/tests/sql_indexed_seek_checkpointed_perf_tests.rs index b385e82..1f1d10c 100644 --- a/crates/decentdb/tests/sql_indexed_seek_checkpointed_perf_tests.rs +++ b/crates/decentdb/tests/sql_indexed_seek_checkpointed_perf_tests.rs @@ -69,9 +69,9 @@ fn indexed_equality_on_large_checkpointed_table_is_bounded() { // Insert 300,000 rows using transactions (exceeds old 
250,000 limit) let row_count: i64 = 300_000; let mut txn = db.transaction().unwrap(); - let stmt = txn.prepare( - "INSERT INTO Artist VALUES ($1, $2, $3, $4, $5, $6)" - ).unwrap(); + let stmt = txn + .prepare("INSERT INTO Artist VALUES ($1, $2, $3, $4, $5, $6)") + .unwrap(); for i in 1..=row_count { stmt.execute_in( &mut txn, @@ -236,7 +236,9 @@ fn explain_reports_index_seek_for_checkpointed_table() { .unwrap(); let mut txn = db.transaction().unwrap(); - let stmt = txn.prepare("INSERT INTO Probe VALUES ($1, $2, $3)").unwrap(); + let stmt = txn + .prepare("INSERT INTO Probe VALUES ($1, $2, $3)") + .unwrap(); for i in 1..=5_000 { stmt.execute_in( &mut txn, From d33080408df695b29f79c332665bfc1f68aecf67 Mon Sep 17 00:00:00 2001 From: Steven Hildreth Date: Thu, 11 Jun 2026 13:46:27 -0500 Subject: [PATCH 3/9] feat(exec): add support for bounded range queries with optional limits --- crates/decentdb/src/db.rs | 96 ++++++++++++++++++++++++++++++ crates/decentdb/src/exec/mod.rs | 101 +++++++++++++++++++++----------- 2 files changed, 163 insertions(+), 34 deletions(-) diff --git a/crates/decentdb/src/db.rs b/crates/decentdb/src/db.rs index 69d59a8..409d31a 100644 --- a/crates/decentdb/src/db.rs +++ b/crates/decentdb/src/db.rs @@ -177,12 +177,21 @@ pub struct PreparedStatement { temp_schema_cookie: u32, statement: Arc, prepared_sql: String, + simple_row_id_projection: Option, prepared_insert: Option>, prepared_update: Option>, prepared_delete: Option>, read_only: bool, } +#[derive(Clone, Debug)] +struct PreparedSimpleRowIdProjection { + table_name: String, + projection_columns: Vec, + filter_column: String, + param_index: usize, +} + /// Transaction-scoped prepared statement executor for repeated rows. 
/// /// This handle validates the prepared statement and resolves the insert fast @@ -4272,11 +4281,68 @@ impl Db { } } + fn try_execute_prepared_simple_row_id_projection( + &self, + prepared: &PreparedStatement, + params: &[Value], + ) -> Result> { + if self.inner.sql_txn_active.load(Ordering::Acquire) { + return Ok(None); + } + let Some(plan) = prepared.simple_row_id_projection.as_ref() else { + return Ok(None); + }; + let Some(Value::Int64(lookup_row_id)) = params.get(plan.param_index) else { + return Ok(None); + }; + + let reader = self.inner.wal.begin_reader_with_pager(&self.inner.pager)?; + let snapshot_lsn = reader.snapshot_lsn(); + self.refresh_engine_from_snapshot(snapshot_lsn)?; + if self.ensure_security_tables_loaded_at_snapshot(snapshot_lsn)? { + return Ok(None); + } + let runtime = self + .inner + .engine + .read() + .map_err(|_| DbError::internal("engine runtime lock poisoned"))?; + self.validate_prepared_schema_cookie( + prepared, + runtime.catalog.schema_cookie, + runtime.temp_schema_cookie, + )?; + let projection_columns = plan + .projection_columns + .iter() + .map(String::as_str) + .collect::>(); + let result = + runtime.execute_simple_row_id_projection_at_snapshot(SimpleRowIdProjectionRequest { + table_name: plan.table_name.as_str(), + projection_columns: &projection_columns, + filter_column: plan.filter_column.as_str(), + lookup_row_id: *lookup_row_id, + pager: &self.inner.pager, + wal: &self.inner.wal, + snapshot_lsn, + use_persistent_pk_index: self.inner.config.persistent_pk_index, + })?; + drop(runtime); + drop(reader); + Ok(result) + } + fn execute_prepared_read_statement( &self, prepared: &PreparedStatement, params: &[Value], ) -> Result { + if let Some(result) = + self.try_execute_prepared_simple_row_id_projection(prepared, params)? 
+ { + return Ok(result); + } { let runtime = self .inner @@ -5569,12 +5635,26 @@ impl Db { ), _ => (None, None, None), }; + let simple_row_id_projection = + parse_simple_row_id_projection_sql(&prepared_sql).map(|plan| { + PreparedSimpleRowIdProjection { + table_name: plan.table_name.to_string(), + projection_columns: plan + .projection_columns + .into_iter() + .map(str::to_string) + .collect(), + filter_column: plan.filter_column.to_string(), + param_index: plan.param_index, + } + }); Ok(PreparedStatement { db: self.clone(), schema_cookie: runtime.catalog.schema_cookie, temp_schema_cookie: runtime.temp_schema_cookie, statement: Arc::clone(&statement), prepared_sql: prepared_sql.clone(), + simple_row_id_projection, prepared_insert, prepared_update, prepared_delete, @@ -29283,6 +29363,22 @@ mod tests { assert_eq!(result.rows()[0].values(), &[Value::Text("u10".to_string())]); assert_eq!(result.rows()[2].values(), &[Value::Text("u12".to_string())]); + let lower_limit = db + .prepare("SELECT name FROM users WHERE id >= $1 LIMIT $2") + .expect("prepare lower-bound range lookup"); + let result = lower_limit + .execute(&[Value::Int64(200), Value::Int64(2)]) + .expect("execute lower-bound range lookup"); + assert_eq!(result.rows().len(), 2); + assert_eq!( + result.rows()[0].values(), + &[Value::Text("u200".to_string())] + ); + assert_eq!( + result.rows()[1].values(), + &[Value::Text("u201".to_string())] + ); + let json_after = db .inspect_storage_state_json() .expect("json after prepared range lookup"); diff --git a/crates/decentdb/src/exec/mod.rs b/crates/decentdb/src/exec/mod.rs index 9aafdbc..47b3708 100644 --- a/crates/decentdb/src/exec/mod.rs +++ b/crates/decentdb/src/exec/mod.rs @@ -9545,25 +9545,29 @@ impl EngineRuntime { limit: Option, offset: usize, ) -> Result> { - if lower_bound.is_none() || upper_bound.is_none() { + let lower_only_limited = lower_bound.is_some() && upper_bound.is_none() && limit.is_some(); + let bounded_range = lower_bound.is_some() && 
upper_bound.is_some(); + if !bounded_range && !lower_only_limited { return Ok(None); } - if order_by.len() != 1 || order_by[0].descending { - return Ok(None); - } - let Expr::Column { - table: order_table, - column: order_column, - } = &order_by[0].expr - else { - return Ok(None); - }; - if !identifiers_equal(order_column, filter_column) - || order_table - .as_deref() - .is_some_and(|qualifier| !matches_table_binding(table_binding, Some(qualifier))) - { - return Ok(None); + if !order_by.is_empty() { + if order_by.len() != 1 || order_by[0].descending { + return Ok(None); + } + let Expr::Column { + table: order_table, + column: order_column, + } = &order_by[0].expr + else { + return Ok(None); + }; + if !identifiers_equal(order_column, filter_column) + || order_table + .as_deref() + .is_some_and(|qualifier| !matches_table_binding(table_binding, Some(qualifier))) + { + return Ok(None); + } } if !table_schema .primary_key_columns @@ -9591,10 +9595,25 @@ impl EngineRuntime { } let take = limit.unwrap_or(usize::MAX); + let max_probe_steps = if upper_bound.is_none() { + Some( + row_source + .row_count() + .saturating_add(offset) + .saturating_add(take), + ) + } else { + None + }; let mut skipped = 0usize; let mut rows = Vec::with_capacity(take.min(64)); let mut row_id = start; + let mut probe_steps = 0usize; while row_id < end_exclusive && rows.len() < take { + if max_probe_steps.is_some_and(|max_probe_steps| probe_steps >= max_probe_steps) { + return Ok(None); + } + probe_steps = probe_steps.saturating_add(1); if let Some(row) = row_source.row_by_id(row_id)? 
{ if skipped < offset { skipped += 1; @@ -9631,25 +9650,29 @@ impl EngineRuntime { use_persistent_pk_index: bool, paged_locator_cache: Option<&DeferredPagedRowLocatorCache>, ) -> Result> { - if lower_bound.is_none() || upper_bound.is_none() { - return Ok(None); - } - if order_by.len() != 1 || order_by[0].descending { + let lower_only_limited = lower_bound.is_some() && upper_bound.is_none() && limit.is_some(); + let bounded_range = lower_bound.is_some() && upper_bound.is_some(); + if !bounded_range && !lower_only_limited { return Ok(None); } - let Expr::Column { - table: order_table, - column: order_column, - } = &order_by[0].expr - else { - return Ok(None); - }; - if !identifiers_equal(order_column, filter_column) - || order_table - .as_deref() - .is_some_and(|qualifier| !matches_table_binding(table_binding, Some(qualifier))) - { - return Ok(None); + if !order_by.is_empty() { + if order_by.len() != 1 || order_by[0].descending { + return Ok(None); + } + let Expr::Column { + table: order_table, + column: order_column, + } = &order_by[0].expr + else { + return Ok(None); + }; + if !identifiers_equal(order_column, filter_column) + || order_table + .as_deref() + .is_some_and(|qualifier| !matches_table_binding(table_binding, Some(qualifier))) + { + return Ok(None); + } } if !table_schema .primary_key_columns @@ -9687,10 +9710,20 @@ impl EngineRuntime { } let take = limit.unwrap_or(usize::MAX); + let max_probe_steps = if upper_bound.is_none() { + Some(state.row_count.saturating_add(offset).saturating_add(take)) + } else { + None + }; let mut skipped = 0usize; let mut rows = Vec::with_capacity(take.min(64)); let mut row_id = start; + let mut probe_steps = 0usize; while row_id < end_exclusive && rows.len() < take { + if max_probe_steps.is_some_and(|max_probe_steps| probe_steps >= max_probe_steps) { + return Ok(None); + } + probe_steps = probe_steps.saturating_add(1); if let Some(row) = read_deferred_stored_row_by_id( store, state, From 
e269a933bf6d7021818d1e1053a7645aeae4986b Mon Sep 17 00:00:00 2001 From: Steven Hildreth Date: Thu, 11 Jun 2026 16:44:34 -0500 Subject: [PATCH 4/9] Add Metric Improvements Plan for DecentDB performance enhancement This document outlines the strategy for improving DecentDB's performance metrics in comparison to SQLite. It includes baseline metrics, execution plans, and completion criteria to ensure that enhancements do not regress durability or correctness. The focus is on achieving competitive performance in key areas such as point lookup latency, range scans, joins, and aggregate operations while maintaining existing strengths in bulk insert throughput and other metrics. --- benchmarks/rust-baseline/Cargo.lock | 93 +++ benchmarks/rust-baseline/Cargo.toml | 1 + benchmarks/rust-baseline/README.md | 89 ++- benchmarks/rust-baseline/src/main.rs | 380 ++++++++- crates/decentdb/src/db.rs | 340 ++++++++ crates/decentdb/src/exec/mod.rs | 1085 +++++++++++++++++++++++++- design/METRIC_IMPROVEMENTS_PLAN.md | 172 ++++ 7 files changed, 2118 insertions(+), 42 deletions(-) create mode 100644 design/METRIC_IMPROVEMENTS_PLAN.md diff --git a/benchmarks/rust-baseline/Cargo.lock b/benchmarks/rust-baseline/Cargo.lock index 3653352..0f000ca 100644 --- a/benchmarks/rust-baseline/Cargo.lock +++ b/benchmarks/rust-baseline/Cargo.lock @@ -8,6 +8,18 @@ version = "2.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa" +[[package]] +name = "ahash" +version = "0.8.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75" +dependencies = [ + "cfg-if", + "once_cell", + "version_check", + "zerocopy", +] + [[package]] name = "aho-corasick" version = "1.1.4" @@ -368,6 +380,7 @@ dependencies = [ "anyhow", "clap", "decentdb", + "rusqlite", "serde", "serde_json", ] @@ -438,6 +451,18 @@ dependencies = [ "windows-sys 
0.61.2", ] +[[package]] +name = "fallible-iterator" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2acce4a10f12dc2fb14a218589d4f1f62ef011b2d0cc4b3cb1bba8e94da14649" + +[[package]] +name = "fallible-streaming-iterator" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7360491ce676a36bf9bb3c56c1aa791658183a54d2744120f27285738d90465a" + [[package]] name = "fastrand" version = "2.4.1" @@ -526,6 +551,15 @@ version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0cc23270f6e1808e30a928bdc84dea0b9b4136a8bc82338574f23baf47bbd280" +[[package]] +name = "hashbrown" +version = "0.14.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" +dependencies = [ + "ahash", +] + [[package]] name = "hashbrown" version = "0.15.5" @@ -541,6 +575,15 @@ version = "0.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4f467dd6dccf739c208452f8014c75c18bb8301b050ad1cfb27153803edb0f51" +[[package]] +name = "hashlink" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ba4ff7128dee98c7dc9794b6a411377e1404dba1c97deb8d1a55297bd25d8af" +dependencies = [ + "hashbrown 0.14.5", +] + [[package]] name = "heck" version = "0.5.0" @@ -688,6 +731,16 @@ dependencies = [ "pg_query", ] +[[package]] +name = "libsqlite3-sys" +version = "0.28.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c10584274047cb335c23d3e61bcef8e323adae7c5c8c760540f73610177fc3f" +dependencies = [ + "pkg-config", + "vcpkg", +] + [[package]] name = "linux-raw-sys" version = "0.4.15" @@ -1040,6 +1093,20 @@ version = "0.8.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a" +[[package]] +name = "rusqlite" +version 
= "0.31.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b838eba278d213a8beaf485bd313fd580ca4505a00d5871caeb1457c55322cae" +dependencies = [ + "bitflags", + "fallible-iterator", + "fallible-streaming-iterator", + "hashlink", + "libsqlite3-sys", + "smallvec", +] + [[package]] name = "rustc-hash" version = "1.1.0" @@ -1340,6 +1407,12 @@ version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" +[[package]] +name = "vcpkg" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" + [[package]] name = "version_check" version = "0.9.5" @@ -1714,6 +1787,26 @@ dependencies = [ "wasmparser", ] +[[package]] +name = "zerocopy" +version = "0.8.52" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ce1022995ff5ff5d841ad7d994facc23098cd40152f2c1d11cd607c6f530653f" +dependencies = [ + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.8.52" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ae7f38b72ec2a254e2b87ef277cf2cd4fb97cbebf944faa6f33354da0867930" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "zeroize" version = "1.8.2" diff --git a/benchmarks/rust-baseline/Cargo.toml b/benchmarks/rust-baseline/Cargo.toml index 908e91f..b33e0af 100644 --- a/benchmarks/rust-baseline/Cargo.toml +++ b/benchmarks/rust-baseline/Cargo.toml @@ -30,6 +30,7 @@ serde = { version = "1.0", features = ["derive"] } serde_json = "1.0" clap = { version = "4", features = ["derive"] } anyhow = "1" +rusqlite = "0.31" [profile.release] opt-level = 3 diff --git a/benchmarks/rust-baseline/README.md b/benchmarks/rust-baseline/README.md index 925348c..0fb8f0a 100644 --- a/benchmarks/rust-baseline/README.md +++ b/benchmarks/rust-baseline/README.md @@ -1,7 +1,14 
@@ -# DecentDB raw-engine baseline benchmark +# DecentDB rust-baseline benchmark -This is a **raw Rust baseline** for the same benchmark suite the .NET tests -in `..` exercise. It links the `decentdb` crate directly (path-dep against +This benchmark is the apples-to-apples Rust runner for the music-library +workload used to compare DecentDB against SQLite. By default it runs DecentDB +directly through the Rust crate. With `--engine sqlite`, it runs the same schema, +seed plan, and query shapes through `rusqlite`. + +The SQLite path exists only in this benchmark crate. It does not add SQLite +tests, dependencies, or comparison behavior to the DecentDB engine core. + +The default DecentDB path links the `decentdb` crate directly (path-dep against `../../crates/decentdb`) and uses the engine's hot-path API: - `Db::create()` to make a fresh database @@ -14,6 +21,11 @@ There is **no FFI, no marshalling, no LINQ, no parameter rewriter**, and no ADO.NET command/connection layer — so the timings here represent the theoretical engine ceiling that any binding could approach but never beat. +The SQLite path uses `rusqlite` against the same generated workload, with +`journal_mode=WAL`, `synchronous=FULL`, and `wal_autocheckpoint=0`. Each seed +phase runs in one explicit `BEGIN IMMEDIATE` transaction, and query timing +materializes every returned column before counting a row. + ## Schema and queries - `artists`, `albums`, `songs` tables with the same columns/PKs. @@ -44,20 +56,45 @@ counts are reported as `Plan: artists=… total_albums=… total_songs=…`. 
## Build & run ```bash -cd /home/steven/source/decentdb/benchmarks/rust-baseline +cd /home/steven/src/github/decentdb/benchmarks/rust-baseline cargo build --release -./target/release/rust-baseline --scale smoke -./target/release/rust-baseline --scale medium -./target/release/rust-baseline --scale full -./target/release/rust-baseline --scale huge -./target/release/rust-baseline --scale full --profile resident-hot-read +./target/release/rust-baseline --engine decentdb --scale smoke +./target/release/rust-baseline --engine decentdb --scale medium +./target/release/rust-baseline --engine decentdb --scale full +./target/release/rust-baseline --engine decentdb --scale huge +./target/release/rust-baseline --engine sqlite --scale smoke +./target/release/rust-baseline --engine decentdb --scale full --profile resident-hot-read ./target/release/rust-baseline --report ./target/release/rust-baseline --report --report-file /tmp/rust-baseline-report.html ``` +To run the full DecentDB-vs-SQLite comparison into a temporary output +directory, use: + +```bash +cd /home/steven/src/github/decentdb/benchmarks/rust-baseline +cargo build --release +OUT="$PWD/../../.tmp/rust-baseline-compare/results" +DBS="$PWD/../../.tmp/rust-baseline-compare/dbs" +mkdir -p "$OUT" "$DBS" +for scale in smoke medium full huge; do + ./target/release/rust-baseline \ + --engine decentdb \ + --scale "$scale" \ + --out-dir "$OUT" \ + --db-path "$DBS/run-decentdb-$scale.ddb" + ./target/release/rust-baseline \ + --engine sqlite \ + --scale "$scale" \ + --out-dir "$OUT" \ + --db-path "$DBS/run-sqlite-$scale.db" +done +``` + ## Profiles -The default profile uses `DbConfig::default()`: durable WAL, deferred table +`--profile` applies only to `--engine decentdb`. The default profile uses +`DbConfig::default()`: durable WAL, deferred table materialization, and paged row storage with post-commit re-deferral. It is the low-memory profile and should remain the default historical comparison. 
@@ -68,22 +105,32 @@ sources resident after commit instead of dropping them back to the deferred set. This is a fair profile only when reported separately from default because it trades higher process memory for lower repeated read cost. +SQLite runs always use benchmark profile `sqlite-wal-full` and reject +DecentDB-only profiles. + ## Results JSON reports are written to `results/<datetime>-rust-baseline-<profile>-<scale>.json` where `<datetime>` is -`YYYY-MM-DD-HHMM` (e.g., `2026-04-26-1430`). Older checked-in reports omit the -profile segment and are treated as the default profile. This timestamped naming -enables historical comparisons across multiple runs: +`YYYY-MM-DD-HHMM` (e.g., `2026-04-26-1430`). DecentDB default runs use +`default`; tuned DecentDB runs use their selected profile name; SQLite runs use +`sqlite-wal-full`. Older checked-in reports omit the profile segment and are +treated as the default profile. This timestamped naming enables historical +comparisons across multiple runs: ``` results/ ├── 2026-03-24-1200-rust-baseline-full.json -├── 2026-04-01-0900-rust-baseline-full.json -├── 2026-04-26-1430-rust-baseline-full.json +├── 2026-04-26-1430-rust-baseline-default-full.json +├── 2026-06-11-1215-rust-baseline-sqlite-wal-full-full.json └── ... ``` +Each JSON report records `binding`, `benchmark_profile`, `engine_version`, +database/WAL size, peak RSS, total runtime, and every instrumented step. Use +`binding` to separate DecentDB (`RustRaw`) from SQLite (`SQLiteRusqlite`) when +comparing runs programmatically. + ### Historical HTML report `--report` is a **report-only** mode: it does not run a benchmark. Instead it @@ -102,18 +149,6 @@ The generated report includes: Use `--report-file <path>` with `--report` to override the output path. 
-## Headline numbers (engine 2.3.1, scale=`full`, ≈2.75M songs) - -| metric | RustRaw | -|------------------------------|----------:| -| `seed_artists` r/s | 792,664 | -| `seed_albums` r/s | 786,594 | -| `seed_songs` r/s | 672,241 | -| `seed_songs` slowdown vs raw | 1.00× | -| `query_top10_albums` (s) | 3.235 | -| peak RSS | 2.2 GB | -| DB size | 144.9 MB | - ## Engine memory observation (worth filing) The Rust baseline's **peak RSS climbs to 2.2 GB** on `full` while the diff --git a/benchmarks/rust-baseline/src/main.rs b/benchmarks/rust-baseline/src/main.rs index 561bd28..36fa4c2 100644 --- a/benchmarks/rust-baseline/src/main.rs +++ b/benchmarks/rust-baseline/src/main.rs @@ -1,8 +1,7 @@ -// DecentDB raw-engine baseline benchmark. +// DecentDB rust-baseline benchmark. // -// Mirrors the schema/queries used by the .NET AdoNet/MicroOrm/EfCore benchmark -// suite at /tmp/tmp-opus47-decentdb-net-tests, but skips every layer above the -// `decentdb` crate so the numbers represent the engine's theoretical ceiling. +// Mirrors the schema and query shapes used by the music-library comparison +// workload, with direct DecentDB and SQLite reference runners. // // Hot-path pattern (identical to the internal `decentdb-benchmark` scenarios): // 1. db.transaction() -> SqlTransaction (exclusive runtime state) @@ -12,8 +11,8 @@ // // Scales mirror DecentDB.Compare.Common.Scale exactly. // -// Output: pretty-printed JSON to results/-rust-baseline-.json with the -// same shape as RunReport so it can be diffed against the .NET reports. +// Output: pretty-printed JSON to +// results/-rust-baseline--.json. 
use std::fs; use std::fmt::Write as _; @@ -23,10 +22,11 @@ use std::time::Instant; use anyhow::{bail, Context}; use clap::{Parser, ValueEnum}; use decentdb::{DbConfig, PreparedStatement, Value}; +use rusqlite::{params, Connection as SqliteConnection}; use serde::{Deserialize, Serialize}; #[derive(Parser, Debug)] -#[command(version, about = "DecentDB raw-engine baseline benchmark")] +#[command(version, about = "DecentDB rust-baseline benchmark")] struct Cli { /// Scale: smoke | medium | full | huge #[arg(long, default_value = "smoke")] @@ -34,7 +34,7 @@ struct Cli { /// Output directory for JSON report. #[arg(long, default_value = "results")] out_dir: PathBuf, - /// Database path (defaults to ./run-rust-.ddb). + /// Database path (defaults by engine and scale). #[arg(long)] db_path: Option, /// Seed for the deterministic plan. @@ -43,6 +43,9 @@ struct Cli { /// Engine profile: default | resident-hot-read. #[arg(long, value_enum, default_value_t = BenchmarkProfile::Default)] profile: BenchmarkProfile, + /// Engine implementation: decentdb | sqlite. + #[arg(long, value_enum, default_value_t = BenchmarkEngine::DecentDb)] + engine: BenchmarkEngine, /// Generate an HTML report from historical JSON files in the output directory. #[arg(long)] report: bool, @@ -51,6 +54,29 @@ struct Cli { report_file: Option, } +#[derive(Clone, Copy, Debug, Eq, PartialEq, ValueEnum)] +enum BenchmarkEngine { + #[value(name = "decentdb", alias = "decent-db")] + DecentDb, + Sqlite, +} + +impl BenchmarkEngine { + fn binding_name(self) -> &'static str { + match self { + Self::DecentDb => "RustRaw", + Self::Sqlite => "SQLiteRusqlite", + } + } + + fn default_db_path(self, scale: Scale) -> PathBuf { + match self { + Self::DecentDb => PathBuf::from(format!("run-rust-{}.ddb", scale.name)), + Self::Sqlite => PathBuf::from(format!("run-rust-sqlite-{}.db", scale.name)), + } + } +} + #[derive(Clone, Copy, Debug, Eq, PartialEq, ValueEnum)] enum BenchmarkProfile { /// Default durable engine configuration. 
@@ -565,13 +591,16 @@ fn run(cli: Cli) -> anyhow::Result<()> { } let scale = parse_scale(&cli.scale); + if cli.engine == BenchmarkEngine::Sqlite && cli.profile != BenchmarkProfile::Default { + bail!("--profile is only supported for --engine decentdb"); + } let db_path = cli .db_path - .unwrap_or_else(|| PathBuf::from(format!("run-rust-{}.ddb", scale.name))); + .unwrap_or_else(|| cli.engine.default_db_path(scale)); println!( - "Summarizing seed plan: scale={} artists={} albums(target)={} songs_cap={}", - scale.name, scale.artists, scale.albums, scale.songs_cap + "Summarizing seed plan: engine={:?} scale={} artists={} albums(target)={} songs_cap={}", + cli.engine, scale.name, scale.artists, scale.albums, scale.songs_cap ); let summary = summarize_seed_plan(scale, cli.seed); println!( @@ -579,10 +608,14 @@ fn run(cli: Cli) -> anyhow::Result<()> { scale.artists, summary.total_albums, summary.total_songs ); + if cli.engine == BenchmarkEngine::Sqlite { + return run_sqlite_benchmark(scale, cli.seed, summary, db_path, cli.out_dir); + } + delete_db_files(&db_path); let mut report = RunReport { - binding: "RustRaw".to_string(), + binding: cli.engine.binding_name().to_string(), scale_name: scale.name.to_string(), benchmark_profile: cli.profile.as_str().to_string(), target_artists: scale.artists, @@ -766,7 +799,7 @@ fn run(cli: Cli) -> anyhow::Result<()> { let datetime_stamp = format_unix_filename_stamp(report.finished_unix); let out_path = cli.out_dir.join(format!( "{datetime_stamp}-rust-baseline-{}-{}.json", - cli.profile.as_str(), + report.benchmark_profile.as_str(), scale.name )); fs::write(&out_path, serde_json::to_string_pretty(&report)?)?; @@ -778,6 +811,327 @@ fn run(cli: Cli) -> anyhow::Result<()> { Ok(()) } +fn run_sqlite_benchmark( + scale: Scale, + seed: u64, + summary: SeedSummary, + db_path: PathBuf, + out_dir: PathBuf, +) -> anyhow::Result<()> { + delete_db_files(&db_path); + + let mut report = RunReport { + binding: 
BenchmarkEngine::Sqlite.binding_name().to_string(), + scale_name: scale.name.to_string(), + benchmark_profile: "sqlite-wal-full".to_string(), + target_artists: scale.artists, + target_albums: scale.albums, + target_songs_cap: scale.songs_cap, + started_unix: now_unix(), + database_path: db_path.display().to_string(), + ..Default::default() + }; + let mut rec = Recorder::new(&mut report); + + let conn = rec.measure("connect_open", None, || { + open_sqlite_wal_full(&db_path).expect("open sqlite") + }); + rec.report.engine_version = sqlite_engine_version(&conn).unwrap_or_else(|_| "unknown".into()); + + let ddl_batch = build_schema_ddl_batch(); + rec.measure("schema_create", None, || { + conn.execute_batch(&ddl_batch).expect("sqlite ddl batch"); + }); + + let mut insert_artist = conn + .prepare( + "INSERT INTO artists (id, name, country, formed_year) \ + VALUES (?1, ?2, ?3, ?4)", + ) + .expect("prepare sqlite artists"); + rec.measure("seed_artists", Some(u64::from(scale.artists)), || { + seed_sqlite_artists(&conn, &mut insert_artist, scale, seed); + }); + drop(insert_artist); + + let mut insert_album = conn + .prepare( + "INSERT INTO albums (id, artist_id, title, release_year) \ + VALUES (?1, ?2, ?3, ?4)", + ) + .expect("prepare sqlite albums"); + rec.measure("seed_albums", Some(summary.total_albums), || { + seed_sqlite_albums(&conn, &mut insert_album, scale, seed); + }); + drop(insert_album); + + let mut insert_song = conn + .prepare( + "INSERT INTO songs (id, album_id, artist_id, title, duration_ms) \ + VALUES (?1, ?2, ?3, ?4, ?5)", + ) + .expect("prepare sqlite songs"); + rec.measure("seed_songs", Some(summary.total_songs), || { + seed_sqlite_songs(&conn, &mut insert_song, scale, seed); + }); + drop(insert_song); + + rec.measure("query_count_songs", None, || { + let count: i64 = conn + .query_row("SELECT COUNT(*) FROM songs", [], |row| row.get(0)) + .expect("sqlite count"); + println!(" count=Some(Int64({count}))"); + }); + let count: i64 = conn + .query_row("SELECT 
COUNT(*) FROM songs", [], |row| row.get(0)) + .expect("sqlite count extra"); + rec.add_extra("count", serde_json::json!(count)); + + rec.measure("query_aggregate_durations", None, || { + let row: (i64, i64, f64, i64, i64) = conn + .query_row( + "SELECT COUNT(*), SUM(duration_ms), AVG(duration_ms), \ + MIN(duration_ms), MAX(duration_ms) FROM songs", + [], + |row| { + Ok(( + row.get(0)?, + row.get(1)?, + row.get(2)?, + row.get(3)?, + row.get(4)?, + )) + }, + ) + .expect("sqlite aggregate durations"); + println!(" agg_row={row:?}"); + }); + + rec.measure("query_artist_by_id", None, || { + let target = i64::from(scale.artists) / 2 + 1; + let row: (i64, String, String, i64) = conn + .query_row( + "SELECT id, name, country, formed_year FROM artists WHERE id = ?1", + params![target], + |row| Ok((row.get(0)?, row.get(1)?, row.get(2)?, row.get(3)?)), + ) + .expect("sqlite artist by id"); + println!(" artist={row:?}"); + }); + + rec.measure("query_top10_artists_by_songs", None, || { + let rows = sqlite_query_row_count( + &conn, + "SELECT a.id, a.name, COUNT(s.id) AS song_count + FROM artists a + JOIN songs s ON s.artist_id = a.id + GROUP BY a.id, a.name + ORDER BY song_count DESC + LIMIT 10", + [], + ) + .expect("sqlite top10 artists"); + println!(" rows={rows}"); + }); + + rec.measure("query_top10_albums_by_songs", None, || { + let rows = sqlite_query_row_count( + &conn, + "SELECT al.id, al.title, COUNT(s.id) AS song_count + FROM albums al + JOIN songs s ON s.album_id = al.id + GROUP BY al.id, al.title + ORDER BY song_count DESC + LIMIT 10", + [], + ) + .expect("sqlite top10 albums"); + println!(" rows={rows}"); + }); + + rec.measure("query_view_first_1000", None, || { + let rows = sqlite_query_row_count( + &conn, + "SELECT artist_id, artist_name, album_title, song_title \ + FROM v_artist_songs LIMIT 1000", + [], + ) + .expect("sqlite view 1000"); + println!(" rows={rows}"); + }); + + rec.measure("query_songs_for_artist_via_view", None, || { + let rows = 
sqlite_query_row_count( + &conn, + "SELECT album_title, song_title, duration_ms \ + FROM v_artist_songs WHERE artist_id = ?1", + params![1_i64], + ) + .expect("sqlite artist 1 view"); + println!(" rows={rows}"); + }); + + if let Ok(meta) = fs::metadata(&db_path) { + rec.report.database_size_bytes = meta.len(); + } + if let Ok(meta) = fs::metadata(sqlite_wal_path(&db_path)) { + rec.report.wal_size_bytes = meta.len(); + } + rec.report.finished_unix = now_unix(); + + fs::create_dir_all(&out_dir)?; + let datetime_stamp = format_unix_filename_stamp(rec.report.finished_unix); + let out_path = out_dir.join(format!( + "{datetime_stamp}-rust-baseline-{}-{}.json", + rec.report.benchmark_profile.as_str(), + scale.name + )); + fs::write(&out_path, serde_json::to_string_pretty(&rec.report)?)?; + println!("\nWrote {}", out_path.display()); + + drop(conn); + delete_db_files(&db_path); + println!("Cleaned up temp DB files: {}", db_path.display()); + + Ok(()) +} + +fn open_sqlite_wal_full(path: &Path) -> rusqlite::Result { + let conn = SqliteConnection::open(path)?; + let journal_mode: String = conn.query_row("PRAGMA journal_mode=WAL;", [], |row| row.get(0))?; + assert_eq!(journal_mode.to_ascii_lowercase(), "wal"); + conn.execute_batch( + "PRAGMA synchronous=FULL; + PRAGMA wal_autocheckpoint=0;", + )?; + let synchronous: i64 = conn.query_row("PRAGMA synchronous;", [], |row| row.get(0))?; + assert_eq!(synchronous, 2, "expected SQLite synchronous=FULL"); + let wal_autocheckpoint: i64 = + conn.query_row("PRAGMA wal_autocheckpoint;", [], |row| row.get(0))?; + assert_eq!( + wal_autocheckpoint, 0, + "expected SQLite wal_autocheckpoint=0" + ); + Ok(conn) +} + +fn sqlite_engine_version(conn: &SqliteConnection) -> rusqlite::Result { + conn.query_row("SELECT sqlite_version()", [], |row| row.get(0)) +} + +fn sqlite_wal_path(path: &Path) -> PathBuf { + let mut wal = path.as_os_str().to_owned(); + wal.push("-wal"); + PathBuf::from(wal) +} + +fn seed_sqlite_artists( + conn: &SqliteConnection, + 
stmt: &mut rusqlite::Statement<'_>, + scale: Scale, + seed: u64, +) { + conn.execute_batch("BEGIN IMMEDIATE;") + .expect("begin sqlite artists"); + let mut artist_name = String::with_capacity(32); + walk_seed_plan_select( + scale, + seed, + SeedWalkEmit::ARTISTS, + |a| { + artist_name.clear(); + artist_name.push_str("Artist "); + write!(&mut artist_name, "{}", a.id).expect("write artist name"); + stmt.execute(params![a.id, artist_name.as_str(), a.country, a.formed_year]) + .expect("sqlite insert artist"); + }, + |_| {}, + |_| {}, + ); + conn.execute_batch("COMMIT;").expect("commit sqlite artists"); +} + +fn seed_sqlite_albums( + conn: &SqliteConnection, + stmt: &mut rusqlite::Statement<'_>, + scale: Scale, + seed: u64, +) { + conn.execute_batch("BEGIN IMMEDIATE;") + .expect("begin sqlite albums"); + let mut album_title = String::with_capacity(32); + walk_seed_plan_select( + scale, + seed, + SeedWalkEmit::ALBUMS, + |_| {}, + |al| { + album_title.clear(); + album_title.push_str("Album "); + write!(&mut album_title, "{}", al.id).expect("write album title"); + stmt.execute(params![ + al.id, + al.artist_id, + album_title.as_str(), + al.release_year + ]) + .expect("sqlite insert album"); + }, + |_| {}, + ); + conn.execute_batch("COMMIT;").expect("commit sqlite albums"); +} + +fn seed_sqlite_songs( + conn: &SqliteConnection, + stmt: &mut rusqlite::Statement<'_>, + scale: Scale, + seed: u64, +) { + conn.execute_batch("BEGIN IMMEDIATE;") + .expect("begin sqlite songs"); + let mut song_title = String::with_capacity(32); + walk_seed_plan_select( + scale, + seed, + SeedWalkEmit::SONGS, + |_| {}, + |_| {}, + |s| { + song_title.clear(); + song_title.push_str("Song "); + write!(&mut song_title, "{}", s.id).expect("write song title"); + stmt.execute(params![ + s.id, + s.album_id, + s.artist_id, + song_title.as_str(), + s.duration_ms + ]) + .expect("sqlite insert song"); + }, + ); + conn.execute_batch("COMMIT;").expect("commit sqlite songs"); +} + +fn sqlite_query_row_count( + 
conn: &SqliteConnection, + sql: &str, + params: P, +) -> rusqlite::Result { + let mut stmt = conn.prepare(sql)?; + let column_count = stmt.column_count(); + let mut rows = stmt.query(params)?; + let mut count = 0usize; + while let Some(row) = rows.next()? { + for index in 0..column_count { + let _: rusqlite::types::Value = row.get(index)?; + } + count += 1; + } + Ok(count) +} + fn seed_albums(db: &decentdb::Db, prepared: &PreparedStatement, scale: Scale, seed: u64) { let mut txn = db.transaction().expect("begin albums"); let params: &mut [Value] = &mut [ diff --git a/crates/decentdb/src/db.rs b/crates/decentdb/src/db.rs index 409d31a..6ebdadc 100644 --- a/crates/decentdb/src/db.rs +++ b/crates/decentdb/src/db.rs @@ -3309,6 +3309,18 @@ impl Db { return self.finalize_row_source_autocommit_statement(statement, Ok(result)); } if prepared.is_none() { + if let Some(result) = self + .try_execute_indexed_join_grouped_count_query_at_snapshot( + &runtime, + query, + params, + snapshot_lsn, + )? + { + drop(runtime); + return self + .finalize_row_source_autocommit_statement(statement, Ok(result)); + } if let Some(result) = self .try_execute_simple_indexed_join_projection_query_at_snapshot( &runtime, @@ -6908,6 +6920,35 @@ impl Db { Some(()) } + fn try_execute_indexed_join_grouped_count_query_at_snapshot( + &self, + runtime: &EngineRuntime, + query: &crate::sql::ast::Query, + params: &[Value], + snapshot_lsn: u64, + ) -> Result> { + if !runtime.has_deferred_tables() { + return Ok(None); + } + let Some(parent_table_name) = + runtime.indexed_join_grouped_count_parent_table_name(query, params)? 
+ else { + return Ok(None); + }; + + let parent_table_name = parent_table_name.to_string(); + let mut join_runtime = runtime.clone(); + self.load_runtime_table_row_sources_at_snapshot( + &mut join_runtime, + &[parent_table_name.as_str()], + snapshot_lsn, + )?; + let result = join_runtime.try_execute_indexed_join_grouped_count_query(query, params); + drop(join_runtime); + self.release_freed_heap_after_paged_row_source_drop(); + result + } + fn try_execute_simple_indexed_join_projection_query_at_snapshot( &self, runtime: &EngineRuntime, @@ -7024,6 +7065,16 @@ impl Db { } if !security_active && !*indexes_maybe_stale { if let SqlStatement::Query(query) = statement { + if let Some(result) = self + .try_execute_indexed_join_grouped_count_query_at_snapshot( + runtime, + query, + params, + snapshot_lsn, + )? + { + return Ok(result); + } if let Some(result) = self .try_execute_simple_indexed_join_projection_query_at_snapshot( runtime, @@ -23265,6 +23316,215 @@ mod tests { ); } + #[test] + fn paged_row_storage_view_filter_indexed_join_chain_keeps_deferred_tables_unloaded() { + let tempdir = TempDir::new().expect("tempdir"); + let path = tempdir + .path() + .join("paged-row-storage-view-filtered-join-chain.ddb"); + let config = DbConfig { + paged_row_storage: true, + ..DbConfig::default() + }; + + { + let db = Db::open_or_create(&path, config.clone()).expect("open db"); + db.execute("CREATE TABLE artists (id INTEGER PRIMARY KEY, name TEXT NOT NULL)") + .expect("create artists"); + db.execute( + "CREATE TABLE albums (id INTEGER PRIMARY KEY, artist_id INTEGER NOT NULL, title TEXT)", + ) + .expect("create albums"); + db.execute( + "CREATE TABLE songs (id INTEGER PRIMARY KEY, album_id INTEGER NOT NULL, title TEXT, duration_ms INTEGER NOT NULL)", + ) + .expect("create songs"); + db.execute("CREATE INDEX idx_albums_artist ON albums (artist_id)") + .expect("create albums artist index"); + db.execute("CREATE INDEX idx_songs_album ON songs (album_id)") + .expect("create songs album 
index"); + db.execute( + "CREATE VIEW v_artist_songs AS \ + SELECT a.id AS artist_id, a.name AS artist_name, al.title AS album_title, \ + s.title AS song_title, s.duration_ms AS duration_ms \ + FROM artists a JOIN albums al ON al.artist_id = a.id \ + JOIN songs s ON s.album_id = al.id", + ) + .expect("create view"); + + db.execute("INSERT INTO artists (id, name) VALUES (1, 'a')") + .expect("insert artist 1"); + db.execute("INSERT INTO artists (id, name) VALUES (2, 'b')") + .expect("insert artist 2"); + db.execute("INSERT INTO albums (id, artist_id, title) VALUES (10, 1, 'a1')") + .expect("insert album 1"); + db.execute("INSERT INTO albums (id, artist_id, title) VALUES (20, 2, 'b1')") + .expect("insert album 2"); + db.execute( + "INSERT INTO songs (id, album_id, title, duration_ms) VALUES (100, 10, 's1', 1000)", + ) + .expect("insert song 1"); + db.execute( + "INSERT INTO songs (id, album_id, title, duration_ms) VALUES (101, 10, 's2', 2000)", + ) + .expect("insert song 2"); + db.execute( + "INSERT INTO songs (id, album_id, title, duration_ms) VALUES (200, 20, 's3', 3000)", + ) + .expect("insert song 3"); + db.checkpoint().expect("checkpoint"); + } + + let db = Db::open_or_create(&path, config).expect("reopen db"); + let json_before = db + .inspect_storage_state_json() + .expect("json before view query"); + assert!( + json_before.contains("\"loaded_table_count\":0"), + "expected view base tables to start deferred, got: {json_before}" + ); + assert!( + json_before.contains("\"deferred_table_count\":3"), + "expected all view base tables deferred at reopen, got: {json_before}" + ); + + let result = db + .execute_with_params( + "SELECT album_title, song_title, duration_ms \ + FROM v_artist_songs WHERE artist_id = $1", + &[Value::Int64(1)], + ) + .expect("view filtered join query"); + assert_eq!(result.rows().len(), 2); + assert_eq!( + result.rows()[0].values(), + &[ + Value::Text("a1".to_string()), + Value::Text("s1".to_string()), + Value::Int64(1000) + ] + ); + 
assert_eq!( + result.rows()[1].values(), + &[ + Value::Text("a1".to_string()), + Value::Text("s2".to_string()), + Value::Int64(2000) + ] + ); + + let json_after = db + .inspect_storage_state_json() + .expect("json after view query"); + assert!( + json_after.contains("\"loaded_table_count\":0"), + "expected deferred view filtered join to avoid resident table materialization, got: {json_after}" + ); + assert!( + json_after.contains("\"deferred_table_count\":3"), + "expected deferred view filtered join to leave base tables deferred, got: {json_after}" + ); + } + + #[test] + fn paged_row_storage_view_limit_indexed_join_chain_keeps_deferred_tables_unloaded() { + let tempdir = TempDir::new().expect("tempdir"); + let path = tempdir + .path() + .join("paged-row-storage-view-limit-join-chain.ddb"); + let config = DbConfig { + paged_row_storage: true, + ..DbConfig::default() + }; + + { + let db = Db::open_or_create(&path, config.clone()).expect("open db"); + db.execute("CREATE TABLE artists (id INTEGER PRIMARY KEY, name TEXT NOT NULL)") + .expect("create artists"); + db.execute( + "CREATE TABLE albums (id INTEGER PRIMARY KEY, artist_id INTEGER NOT NULL, title TEXT)", + ) + .expect("create albums"); + db.execute( + "CREATE TABLE songs (id INTEGER PRIMARY KEY, album_id INTEGER NOT NULL, title TEXT, duration_ms INTEGER NOT NULL)", + ) + .expect("create songs"); + db.execute("CREATE INDEX idx_albums_artist ON albums (artist_id)") + .expect("create albums artist index"); + db.execute("CREATE INDEX idx_songs_album ON songs (album_id)") + .expect("create songs album index"); + db.execute( + "CREATE VIEW v_artist_songs AS \ + SELECT a.id AS artist_id, a.name AS artist_name, al.title AS album_title, \ + s.title AS song_title, s.duration_ms AS duration_ms \ + FROM artists a JOIN albums al ON al.artist_id = a.id \ + JOIN songs s ON s.album_id = al.id", + ) + .expect("create view"); + + db.execute("INSERT INTO artists (id, name) VALUES (1, 'a')") + .expect("insert artist 1"); + 
db.execute("INSERT INTO artists (id, name) VALUES (2, 'b')") + .expect("insert artist 2"); + db.execute("INSERT INTO albums (id, artist_id, title) VALUES (10, 1, 'a1')") + .expect("insert album 1"); + db.execute("INSERT INTO albums (id, artist_id, title) VALUES (20, 2, 'b1')") + .expect("insert album 2"); + db.execute( + "INSERT INTO songs (id, album_id, title, duration_ms) VALUES (100, 10, 's1', 1000)", + ) + .expect("insert song 1"); + db.execute( + "INSERT INTO songs (id, album_id, title, duration_ms) VALUES (101, 10, 's2', 2000)", + ) + .expect("insert song 2"); + db.execute( + "INSERT INTO songs (id, album_id, title, duration_ms) VALUES (200, 20, 's3', 3000)", + ) + .expect("insert song 3"); + db.checkpoint().expect("checkpoint"); + } + + let db = Db::open_or_create(&path, config).expect("reopen db"); + let result = db + .execute( + "SELECT artist_id, artist_name, album_title, song_title \ + FROM v_artist_songs LIMIT 2", + ) + .expect("view limit query"); + assert_eq!(result.rows().len(), 2); + assert_eq!( + result.rows()[0].values(), + &[ + Value::Int64(1), + Value::Text("a".to_string()), + Value::Text("a1".to_string()), + Value::Text("s1".to_string()) + ] + ); + assert_eq!( + result.rows()[1].values(), + &[ + Value::Int64(1), + Value::Text("a".to_string()), + Value::Text("a1".to_string()), + Value::Text("s2".to_string()) + ] + ); + + let json_after = db + .inspect_storage_state_json() + .expect("json after view limit query"); + assert!( + json_after.contains("\"loaded_table_count\":0"), + "expected deferred view limit to avoid resident table materialization, got: {json_after}" + ); + assert!( + json_after.contains("\"deferred_table_count\":3"), + "expected deferred view limit to leave base tables deferred, got: {json_after}" + ); + } + #[test] fn paged_row_storage_indexed_join_order_limit_offset_keeps_deferred_tables_unloaded() { let tempdir = TempDir::new().expect("tempdir"); @@ -26696,6 +26956,86 @@ mod tests { ); } + #[test] + fn 
paged_row_storage_indexed_join_grouped_count_keeps_deferred_tables_unloaded() { + let tempdir = TempDir::new().expect("tempdir"); + let path = tempdir + .path() + .join("paged-row-storage-indexed-join-grouped-count.ddb"); + let config = DbConfig { + paged_row_storage: true, + ..DbConfig::default() + }; + + { + let db = Db::open_or_create(&path, config.clone()).expect("create db"); + db.execute("CREATE TABLE artists (id INTEGER PRIMARY KEY, name TEXT NOT NULL)") + .expect("create artists"); + db.execute( + "CREATE TABLE songs (id INTEGER PRIMARY KEY, artist_id INTEGER NOT NULL, title TEXT, body TEXT)", + ) + .expect("create songs"); + db.execute("CREATE INDEX idx_songs_artist ON songs (artist_id)") + .expect("create song artist index"); + db.execute("INSERT INTO artists (id, name) VALUES (1, 'a')") + .expect("insert artist 1"); + db.execute("INSERT INTO artists (id, name) VALUES (2, 'b')") + .expect("insert artist 2"); + let large_body = "x".repeat(2048); + for (id, artist_id) in [(1, 1), (2, 1), (3, 2)] { + db.execute_with_params( + "INSERT INTO songs (id, artist_id, title, body) VALUES ($1, $2, $3, $4)", + &[ + Value::Int64(id), + Value::Int64(artist_id), + Value::Text(format!("s{id}")), + Value::Text(large_body.clone()), + ], + ) + .expect("insert song"); + } + db.checkpoint().expect("checkpoint"); + } + + let db = Db::open_or_create(&path, config).expect("reopen with paged storage"); + let result = db + .execute( + "SELECT a.id, a.name, COUNT(s.id) AS song_count \ + FROM artists a JOIN songs s ON s.artist_id = a.id \ + GROUP BY a.id, a.name ORDER BY song_count DESC LIMIT 10", + ) + .expect("indexed grouped count"); + assert_eq!(result.rows().len(), 2); + assert_eq!( + result.rows()[0].values(), + &[ + Value::Int64(1), + Value::Text("a".to_string()), + Value::Int64(2) + ] + ); + assert_eq!( + result.rows()[1].values(), + &[ + Value::Int64(2), + Value::Text("b".to_string()), + Value::Int64(1) + ] + ); + + let json_after = db + .inspect_storage_state_json() + 
.expect("json after indexed grouped count"); + assert!( + json_after.contains("\"loaded_table_count\":0"), + "expected indexed grouped count to use cloned row sources only, got: {json_after}" + ); + assert!( + json_after.contains("\"deferred_table_count\":2"), + "expected base tables to remain deferred after indexed grouped count, got: {json_after}" + ); + } + #[test] fn paged_row_storage_grouped_numeric_aggregate_keeps_deferred_table_unloaded() { let tempdir = TempDir::new().expect("tempdir"); diff --git a/crates/decentdb/src/exec/mod.rs b/crates/decentdb/src/exec/mod.rs index 47b3708..5c43dbc 100644 --- a/crates/decentdb/src/exec/mod.rs +++ b/crates/decentdb/src/exec/mod.rs @@ -103,6 +103,7 @@ const ENUM_COLUMNS_SECTION_MAGIC: &[u8; 8] = b"DDBENU01"; const FULL_TEXT_OPTIONS_SECTION_MAGIC: &[u8; 8] = b"DDBFTS01"; const SIGNED_ROW_ID_BIAS: u64 = 0x8000_0000_0000_0000; const DEFERRED_COMPRESSED_LOOKUP_CACHE_LIMIT: usize = 32; +const DEFERRED_VIEW_LIMIT_MIN_PERSISTED_ROWS: usize = 100_000; static RANDOM_STATE: AtomicU64 = AtomicU64::new(0); static DEFERRED_COMPRESSED_LOOKUP_CACHE: OnceLock> = OnceLock::new(); @@ -6465,7 +6466,7 @@ impl EngineRuntime { Ok(QueryResult::with_rows(column_names, rows)) } - fn try_execute_indexed_join_grouped_count_query( + pub(crate) fn try_execute_indexed_join_grouped_count_query( &self, query: &Query, params: &[Value], @@ -6556,6 +6557,16 @@ impl EngineRuntime { )?)) } + pub(crate) fn indexed_join_grouped_count_parent_table_name<'a>( + &'a self, + query: &'a Query, + params: &[Value], + ) -> Result> { + Ok(self + .analyze_indexed_join_grouped_count_query(query, params)? 
+ .map(|plan| plan.parent_table_name)) + } + fn analyze_indexed_join_grouped_count_query<'a>( &'a self, query: &'a Query, @@ -11574,6 +11585,777 @@ impl EngineRuntime { ))) } + #[allow(clippy::too_many_arguments)] + fn try_execute_simple_deferred_view_projection_limit_query( + &self, + query: &Query, + params: &[Value], + pager: &PagerHandle, + wal: &WalHandle, + snapshot_lsn: u64, + use_persistent_pk_index: bool, + ) -> Result> { + if query.recursive || !query.ctes.is_empty() || !query.order_by.is_empty() { + return Ok(None); + } + let Some(limit_expr) = query.limit.as_ref() else { + return Ok(None); + }; + let ctes = BTreeMap::new(); + let limit = usize::try_from(self.eval_constant_i64(limit_expr, params, &ctes)?.max(0)) + .unwrap_or(usize::MAX); + let offset = query + .offset + .as_ref() + .map(|expr| self.eval_constant_i64(expr, params, &ctes)) + .transpose()? + .map(|value| usize::try_from(value.max(0)).unwrap_or(usize::MAX)) + .unwrap_or(0); + + let QueryBody::Select(select) = &query.body else { + return Ok(None); + }; + if select.distinct + || !select.distinct_on.is_empty() + || select.filter.is_some() + || !select.group_by.is_empty() + || select.having.is_some() + || projection_has_aggregate_items(&select.projection) + || select.from.len() != 1 + { + return Ok(None); + } + let FromItem::Table { + name: view_name, + alias: view_alias, + } = &select.from[0] + else { + return Ok(None); + }; + let Some(view) = self.visible_view(view_name, NameResolutionScope::Session) else { + return Ok(None); + }; + if view.temporary { + return Ok(None); + } + let view_binding = view_alias.as_deref().unwrap_or(view_name.as_str()); + + let view_statement = parse_sql_statement(&view.sql_text)?; + let Statement::Query(view_query) = view_statement else { + return Err(DbError::corruption(format!( + "view {} does not contain a SELECT statement", + view.name + ))); + }; + if view_query.recursive + || !view_query.ctes.is_empty() + || !view_query.order_by.is_empty() + || 
view_query.limit.is_some() + || view_query.offset.is_some() + { + return Ok(None); + } + let QueryBody::Select(view_select) = &view_query.body else { + return Ok(None); + }; + if view_select.distinct + || !view_select.distinct_on.is_empty() + || !view_select.group_by.is_empty() + || view_select.having.is_some() + || view_select.filter.is_some() + || projection_has_aggregate_items(&view_select.projection) + || view_select.from.len() != 1 + { + return Ok(None); + } + + let mut table_bindings = Vec::new(); + let mut join_constraints = Vec::new(); + if !flatten_inner_join_chain( + &view_select.from[0], + &mut table_bindings, + &mut join_constraints, + ) || !(2..=3).contains(&table_bindings.len()) + || join_constraints.len() + 1 != table_bindings.len() + { + return Ok(None); + } + + let mut table_schemas = Vec::with_capacity(table_bindings.len()); + for binding in &table_bindings { + if self + .visible_view(binding.name, NameResolutionScope::Session) + .is_some() + || self.visible_table_is_temporary(binding.name) + { + return Ok(None); + } + let Some(schema) = self.table_schema(binding.name) else { + return Ok(None); + }; + if !generated_columns_are_stored(schema) { + return Ok(None); + } + table_schemas.push(schema); + } + let deferred_row_count = table_bindings + .iter() + .filter(|binding| self.visible_table_row_source(binding.name).is_none()) + .filter_map(|binding| self.persisted_table_state(binding.name)) + .map(|state| state.row_count) + .sum::(); + if deferred_row_count < DEFERRED_VIEW_LIMIT_MIN_PERSISTED_ROWS { + return Ok(None); + } + + let mut projections = Vec::with_capacity(select.projection.len()); + let mut column_names = Vec::with_capacity(select.projection.len()); + for (ordinal, item) in select.projection.iter().enumerate() { + let SelectItem::Expr { expr, alias } = item else { + return Ok(None); + }; + let Expr::Column { + table: outer_table, + column: outer_column, + } = expr + else { + return Ok(None); + }; + if outer_table + .as_deref() + 
.is_some_and(|qualifier| !identifiers_equal(qualifier, view_binding)) + { + return Ok(None); + } + let Some(view_expr) = + view_projection_expr_for_output_column(&view_select.projection, outer_column) + else { + return Ok(None); + }; + let Expr::Column { + table: Some(base_table), + column: base_column, + } = view_expr + else { + return Ok(None); + }; + let Some(table_index) = table_bindings + .iter() + .position(|binding| identifiers_equal(binding.binding_name(), &base_table)) + else { + return Ok(None); + }; + let Some(column_index) = schema_column_index(table_schemas[table_index], &base_column) + else { + return Ok(None); + }; + projections.push(DeferredViewProjection { + table_index, + column_index, + }); + column_names.push( + alias + .clone() + .unwrap_or_else(|| infer_expr_name(expr, ordinal + 1)), + ); + } + if limit == 0 { + return Ok(Some(QueryResult::with_rows(column_names, Vec::new()))); + } + + let mut join_steps = Vec::with_capacity(join_constraints.len()); + for current_table_index in 1..table_bindings.len() { + let Some(step) = self.deferred_view_join_step( + &table_bindings, + &table_schemas, + join_constraints[current_table_index - 1], + current_table_index, + )? 
+ else { + return Ok(None); + }; + join_steps.push(step); + } + + for (binding, schema) in table_bindings.iter().zip(table_schemas.iter()) { + if self.visible_table_row_source(binding.name).is_some() { + continue; + } + let Some(state) = self.persisted_table_state(binding.name) else { + return Ok(None); + }; + let cache = self + .catalog + .table(binding.name) + .and_then(|table| self.deferred_paged_row_locator_caches.get(&table.name)) + .map(|cache| cache.as_ref()); + if !deferred_rowid_lookup_available(state, schema, use_persistent_pk_index, cache) { + return Ok(None); + } + } + + let store = SnapshotPageStore { + pager, + wal, + snapshot_lsn, + }; + let mut rows = Vec::new(); + let mut offset_remaining = offset; + let mut limit_remaining = limit; + let root_binding = table_bindings[0]; + if let Some(source) = self.visible_table_row_source(root_binding.name) { + for root_row in source.rows() { + let root_row = root_row?; + let root_row = StoredRow { + row_id: root_row.row_id(), + values: root_row.values().to_vec(), + }; + if self.push_deferred_view_limit_rows_from_root( + &store, + &table_bindings, + &table_schemas, + &join_steps, + &projections, + root_row, + &mut offset_remaining, + &mut limit_remaining, + &mut rows, + use_persistent_pk_index, + )? 
{ + break; + } + } + } else { + let Some(root_state) = self.persisted_table_state(root_binding.name) else { + return Ok(None); + }; + visit_persisted_table_rows_until(&store, root_state, |row_id, values| { + let root_row = StoredRow { + row_id, + values: values.to_vec(), + }; + self.push_deferred_view_limit_rows_from_root( + &store, + &table_bindings, + &table_schemas, + &join_steps, + &projections, + root_row, + &mut offset_remaining, + &mut limit_remaining, + &mut rows, + use_persistent_pk_index, + ) + })?; + } + + Ok(Some(QueryResult::with_rows(column_names, rows))) + } + + #[allow(clippy::too_many_arguments)] + fn try_execute_simple_deferred_view_filter_projection_query( + &self, + query: &Query, + params: &[Value], + pager: &PagerHandle, + wal: &WalHandle, + snapshot_lsn: u64, + use_persistent_pk_index: bool, + ) -> Result> { + if query.recursive + || !query.ctes.is_empty() + || !query.order_by.is_empty() + || query.limit.is_some() + || query.offset.is_some() + { + return Ok(None); + } + let QueryBody::Select(select) = &query.body else { + return Ok(None); + }; + if select.distinct + || !select.distinct_on.is_empty() + || !select.group_by.is_empty() + || select.having.is_some() + || projection_has_aggregate_items(&select.projection) + || select.from.len() != 1 + { + return Ok(None); + } + let Some(filter) = select.filter.as_ref() else { + return Ok(None); + }; + let FromItem::Table { + name: view_name, + alias: view_alias, + } = &select.from[0] + else { + return Ok(None); + }; + let Some(view) = self.visible_view(view_name, NameResolutionScope::Session) else { + return Ok(None); + }; + if view.temporary { + return Ok(None); + } + let view_binding = view_alias.as_deref().unwrap_or(view_name.as_str()); + let Some((filter_qualifier, filter_column, value_expr)) = simple_btree_lookup(filter) + else { + return Ok(None); + }; + if filter_qualifier.is_some_and(|qualifier| !identifiers_equal(qualifier, view_binding)) { + return Ok(None); + } + + let view_statement = 
parse_sql_statement(&view.sql_text)?; + let Statement::Query(view_query) = view_statement else { + return Err(DbError::corruption(format!( + "view {} does not contain a SELECT statement", + view.name + ))); + }; + if view_query.recursive + || !view_query.ctes.is_empty() + || !view_query.order_by.is_empty() + || view_query.limit.is_some() + || view_query.offset.is_some() + { + return Ok(None); + } + let QueryBody::Select(view_select) = &view_query.body else { + return Ok(None); + }; + if view_select.distinct + || !view_select.distinct_on.is_empty() + || !view_select.group_by.is_empty() + || view_select.having.is_some() + || view_select.filter.is_some() + || projection_has_aggregate_items(&view_select.projection) + || view_select.from.len() != 1 + { + return Ok(None); + } + + let Some(filter_source_expr) = + view_projection_expr_for_output_column(&view_select.projection, filter_column) + else { + return Ok(None); + }; + let Expr::Column { + table: Some(filter_source_table), + column: filter_source_column, + } = &filter_source_expr + else { + return Ok(None); + }; + + let mut table_bindings = Vec::new(); + let mut join_constraints = Vec::new(); + if !flatten_inner_join_chain( + &view_select.from[0], + &mut table_bindings, + &mut join_constraints, + ) || table_bindings.len() < 2 + || join_constraints.len() + 1 != table_bindings.len() + { + return Ok(None); + } + + let mut table_schemas = Vec::with_capacity(table_bindings.len()); + for binding in &table_bindings { + if self + .visible_view(binding.name, NameResolutionScope::Session) + .is_some() + || self.visible_table_is_temporary(binding.name) + { + return Ok(None); + } + let Some(schema) = self.table_schema(binding.name) else { + return Ok(None); + }; + if !generated_columns_are_stored(schema) { + return Ok(None); + } + table_schemas.push(schema); + } + + let Some(source_table_index) = table_bindings + .iter() + .position(|binding| identifiers_equal(binding.binding_name(), filter_source_table)) + else { + return 
Ok(None); + }; + if source_table_index != 0 { + return Ok(None); + } + let Some(source_rowid_column) = row_id_alias_column_name(table_schemas[source_table_index]) + else { + return Ok(None); + }; + if !identifiers_equal(source_rowid_column, filter_source_column) { + return Ok(None); + } + + let mut projections = Vec::with_capacity(select.projection.len()); + let mut column_names = Vec::with_capacity(select.projection.len()); + for (ordinal, item) in select.projection.iter().enumerate() { + let SelectItem::Expr { expr, alias } = item else { + return Ok(None); + }; + let Expr::Column { + table: outer_table, + column: outer_column, + } = expr + else { + return Ok(None); + }; + if outer_table + .as_deref() + .is_some_and(|qualifier| !identifiers_equal(qualifier, view_binding)) + { + return Ok(None); + } + let Some(view_expr) = + view_projection_expr_for_output_column(&view_select.projection, outer_column) + else { + return Ok(None); + }; + let Expr::Column { + table: Some(base_table), + column: base_column, + } = view_expr + else { + return Ok(None); + }; + let Some(table_index) = table_bindings + .iter() + .position(|binding| identifiers_equal(binding.binding_name(), &base_table)) + else { + return Ok(None); + }; + let Some(column_index) = schema_column_index(table_schemas[table_index], &base_column) + else { + return Ok(None); + }; + projections.push(DeferredViewProjection { + table_index, + column_index, + }); + column_names.push( + alias + .clone() + .unwrap_or_else(|| infer_expr_name(expr, ordinal + 1)), + ); + } + + let mut join_steps = Vec::with_capacity(join_constraints.len()); + for current_table_index in 1..table_bindings.len() { + let Some(step) = self.deferred_view_join_step( + &table_bindings, + &table_schemas, + join_constraints[current_table_index - 1], + current_table_index, + )? 
+ else { + return Ok(None); + }; + join_steps.push(step); + } + + for (binding, schema) in table_bindings.iter().zip(table_schemas.iter()) { + if self.visible_table_row_source(binding.name).is_some() { + continue; + } + let Some(state) = self.persisted_table_state(binding.name) else { + return Ok(None); + }; + let cache = self + .catalog + .table(binding.name) + .and_then(|table| self.deferred_paged_row_locator_caches.get(&table.name)) + .map(|cache| cache.as_ref()); + if !deferred_rowid_lookup_available(state, schema, use_persistent_pk_index, cache) { + return Ok(None); + } + } + + let filter_value = self.eval_expr( + value_expr, + &Dataset::empty(), + &[], + params, + &BTreeMap::new(), + None, + )?; + let Value::Int64(source_row_id) = filter_value else { + return Ok(Some(QueryResult::with_rows(column_names, Vec::new()))); + }; + let store = SnapshotPageStore { + pager, + wal, + snapshot_lsn, + }; + let Some(source_row) = self.read_simple_view_join_row_at_snapshot( + &store, + table_bindings[source_table_index].name, + table_schemas[source_table_index], + source_row_id, + use_persistent_pk_index, + )? + else { + return Ok(Some(QueryResult::with_rows(column_names, Vec::new()))); + }; + + let mut partial_rows = vec![vec![source_row]]; + for step in join_steps { + let Some(RuntimeIndex::Btree { keys, .. }) = self.index(&step.current_index_name) + else { + return Ok(None); + }; + let mut next_rows = Vec::new(); + for partial in partial_rows { + let Some(key_value) = partial + .get(step.previous_table_index) + .and_then(|row| row.values.get(step.previous_column_index)) + else { + return Err(DbError::internal( + "deferred view join row is shorter than the planned schema", + )); + }; + if matches!(key_value, Value::Null) { + continue; + } + for row_id in keys.row_ids_for_value(key_value)? 
{ + let Some(joined_row) = self.read_simple_view_join_row_at_snapshot( + &store, + table_bindings[step.current_table_index].name, + table_schemas[step.current_table_index], + row_id, + use_persistent_pk_index, + )? + else { + continue; + }; + let mut extended = partial.clone(); + extended.push(joined_row); + next_rows.push(extended); + } + } + partial_rows = next_rows; + if partial_rows.is_empty() { + break; + } + } + + let mut rows = Vec::with_capacity(partial_rows.len()); + for partial in partial_rows { + let mut values = Vec::with_capacity(projections.len()); + for projection in &projections { + let Some(value) = partial + .get(projection.table_index) + .and_then(|row| row.values.get(projection.column_index)) + else { + return Err(DbError::internal( + "deferred view projection row is shorter than the planned schema", + )); + }; + values.push(value.clone()); + } + rows.push(QueryRow::new(values)); + } + Ok(Some(QueryResult::with_rows(column_names, rows))) + } + + fn deferred_view_join_step( + &self, + table_bindings: &[TableBindingRef<'_>], + table_schemas: &[&TableSchema], + constraint: &Expr, + current_table_index: usize, + ) -> Result> { + let Some(equalities) = simple_join_equalities(constraint) else { + return Ok(None); + }; + let current_binding = table_bindings[current_table_index]; + let mut matched = None; + for (left_ref, right_ref) in equalities { + for (previous_ref, current_ref) in [(left_ref, right_ref), (right_ref, left_ref)] { + if !matches_table_binding(current_binding, current_ref.table) { + continue; + } + let Some(previous_table_index) = table_bindings[..current_table_index] + .iter() + .position(|binding| matches_table_binding(*binding, previous_ref.table)) + else { + continue; + }; + let Some(previous_column_index) = + schema_column_index(table_schemas[previous_table_index], previous_ref.column) + else { + return Ok(None); + }; + let Some(current_index) = self + .simple_btree_index_for_table_column(current_binding.name, current_ref.column) + 
else { + return Ok(None); + }; + if matched + .replace(DeferredViewJoinStep { + previous_table_index, + previous_column_index, + current_table_index, + current_index_name: current_index.name.clone(), + }) + .is_some() + { + return Ok(None); + } + } + } + Ok(matched) + } + + fn simple_btree_index_for_table_column( + &self, + table_name: &str, + column_name: &str, + ) -> Option<&IndexSchema> { + self.catalog.indexes.values().find(|index| { + identifiers_equal(&index.table_name, table_name) + && index.fresh + && index.kind == IndexKind::Btree + && index.predicate_sql.is_none() + && index.columns.len() == 1 + && index.columns[0].expression_sql.is_none() + && index.columns[0] + .column_name + .as_deref() + .is_some_and(|indexed_column| identifiers_equal(indexed_column, column_name)) + && matches!(self.index(&index.name), Some(RuntimeIndex::Btree { .. })) + }) + } + + fn read_simple_view_join_row_at_snapshot( + &self, + store: &S, + table_name: &str, + table_schema: &TableSchema, + row_id: i64, + use_persistent_pk_index: bool, + ) -> Result> { + if let Some(source) = self.visible_table_row_source(table_name) { + return source.row_by_id(row_id).map(|row| { + row.map(|row| StoredRow { + row_id: row.row_id(), + values: row.values().to_vec(), + }) + }); + } + let Some(state) = self.persisted_table_state(table_name) else { + return Ok(None); + }; + let cache = self + .catalog + .table(table_name) + .and_then(|table| self.deferred_paged_row_locator_caches.get(&table.name)) + .map(|cache| cache.as_ref()); + read_deferred_stored_row_by_id( + store, + state, + table_schema, + row_id, + use_persistent_pk_index, + cache, + ) + } + + #[allow(clippy::too_many_arguments)] + fn push_deferred_view_limit_rows_from_root( + &self, + store: &S, + table_bindings: &[TableBindingRef<'_>], + table_schemas: &[&TableSchema], + join_steps: &[DeferredViewJoinStep], + projections: &[DeferredViewProjection], + root_row: StoredRow, + offset_remaining: &mut usize, + limit_remaining: &mut usize, + rows: 
&mut Vec, + use_persistent_pk_index: bool, + ) -> Result { + let mut partial_rows = vec![vec![root_row]]; + for step in join_steps { + let Some(RuntimeIndex::Btree { keys, .. }) = self.index(&step.current_index_name) + else { + return Err(DbError::internal(format!( + "index {} is missing for deferred view limit join", + step.current_index_name + ))); + }; + let mut next_rows = Vec::new(); + for partial in partial_rows { + let Some(key_value) = partial + .get(step.previous_table_index) + .and_then(|row| row.values.get(step.previous_column_index)) + else { + return Err(DbError::internal( + "deferred view limit join row is shorter than the planned schema", + )); + }; + if matches!(key_value, Value::Null) { + continue; + } + for row_id in keys.row_ids_for_value(key_value)? { + let Some(joined_row) = self.read_simple_view_join_row_at_snapshot( + store, + table_bindings[step.current_table_index].name, + table_schemas[step.current_table_index], + row_id, + use_persistent_pk_index, + )? + else { + continue; + }; + let mut extended = partial.clone(); + extended.push(joined_row); + next_rows.push(extended); + } + } + partial_rows = next_rows; + if partial_rows.is_empty() { + return Ok(false); + } + } + + for partial in partial_rows { + if *offset_remaining > 0 { + *offset_remaining -= 1; + continue; + } + if *limit_remaining == 0 { + return Ok(true); + } + let mut values = Vec::with_capacity(projections.len()); + for projection in projections { + let Some(value) = partial + .get(projection.table_index) + .and_then(|row| row.values.get(projection.column_index)) + else { + return Err(DbError::internal( + "deferred view limit projection row is shorter than the planned schema", + )); + }; + values.push(value.clone()); + } + rows.push(QueryRow::new(values)); + *limit_remaining = (*limit_remaining).saturating_sub(1); + if *limit_remaining == 0 { + return Ok(true); + } + } + Ok(false) + } + pub(crate) fn try_execute_simple_deferred_paged_query( &self, query: &Query, @@ -11603,6 
+12385,26 @@ impl EngineRuntime { { return Ok(Some(result)); } + if let Some(result) = self.try_execute_simple_deferred_view_projection_limit_query( + query, + params, + pager, + wal, + snapshot_lsn, + use_persistent_pk_index, + )? { + return Ok(Some(result)); + } + if let Some(result) = self.try_execute_simple_deferred_view_filter_projection_query( + query, + params, + pager, + wal, + snapshot_lsn, + use_persistent_pk_index, + )? { + return Ok(Some(result)); + } if let Some(result) = self.try_execute_simple_deferred_rowid_join_projection_query( query, params, @@ -14452,6 +15254,21 @@ impl EngineRuntime { return Ok(None); } let row_source = self.table_row_source(table_name); + if row_id_alias_column_name(table) + .is_some_and(|row_id_column| identifiers_equal(row_id_column, column_name)) + { + let Some(row_source) = row_source else { + return Ok(None); + }; + let value = self.eval_expr(value_expr, &Dataset::empty(), &[], params, ctes, None)?; + let row_ids = match value { + Value::Int64(row_id) => RuntimeRowIdSet::Single(row_id), + _ => RuntimeRowIdSet::Empty, + }; + return self + .dataset_from_row_id_set(table, Some(row_source), alias, row_ids, false) + .map(Some); + } let Some(index) = self.catalog.indexes.values().find(|index| { identifiers_equal(&index.table_name, table_name) && index.fresh @@ -18275,6 +19092,81 @@ where Ok(total_row_count) } +fn visit_persisted_table_rows_until( + store: &S, + state: PersistedTableState, + mut visitor: F, +) -> Result +where + F: FnMut(i64, &[Value]) -> Result, +{ + if state.pointer.head_page_id == 0 || state.pointer.logical_len == 0 { + return Ok(0); + } + if !state.pointer.is_table_paged_manifest() { + let mut stopped = false; + let row_count = + visit_table_payload_rows_from_pointer(store, state.pointer, &mut |row_id, values| { + if stopped { + return Ok(()); + } + stopped = visitor(row_id, values)?; + Ok(()) + })?; + if !stopped && state.row_count != 0 && row_count != state.row_count { + return 
Err(DbError::corruption("table payload row count mismatch")); + } + return Ok(row_count); + } + + let manifest_payload = read_overflow(store, state.pointer)?; + if crc32c_parts(&[manifest_payload.as_slice()]) != state.checksum { + return Err(DbError::corruption( + "paged table manifest checksum mismatch", + )); + } + let manifest = decode_paged_table_manifest_payload(&manifest_payload)?; + let mut visited_row_count = 0usize; + let mut total_row_count = 0usize; + for chunk in manifest.chunks { + let mut count = 0usize; + + let base_payload = read_overflow(store, chunk.pointer)?; + for row in decode_table_payload_rows(&base_payload)? { + if chunk.tombstoned_row_ids.contains(&row.row_id) { + continue; + } + visited_row_count = visited_row_count.saturating_add(1); + count += 1; + if visitor(row.row_id, &row.values)? { + return Ok(visited_row_count); + } + } + + if let Some(overlay_pointer) = chunk.overlay_pointer { + let overlay_payload = read_overflow(store, overlay_pointer)?; + for row in decode_table_payload_rows(&overlay_payload)? { + visited_row_count = visited_row_count.saturating_add(1); + count += 1; + if visitor(row.row_id, &row.values)? 
{ + return Ok(visited_row_count); + } + } + } + + if count != chunk.row_count { + return Err(DbError::corruption("paged table chunk row count mismatch")); + } + total_row_count = total_row_count.saturating_add(count); + } + if state.row_count != 0 && total_row_count != state.row_count { + return Err(DbError::corruption( + "paged table manifest row count mismatch", + )); + } + Ok(visited_row_count) +} + fn decode_table_payload_rows(bytes: &[u8]) -> Result> { let mut rows = Vec::new(); visit_table_payload_rows_from_bytes(bytes, &mut |row_id, values| { @@ -21611,6 +22503,20 @@ struct IndexedJoinPlan<'a> { filtered_on_left: bool, } +#[derive(Clone, Copy, Debug)] +struct DeferredViewProjection { + table_index: usize, + column_index: usize, +} + +#[derive(Clone, Debug)] +struct DeferredViewJoinStep { + previous_table_index: usize, + previous_column_index: usize, + current_table_index: usize, + current_index_name: String, +} + fn table_output_columns(table: &TableSchema, alias: &Option) -> Vec { let table_name = alias.clone().unwrap_or_else(|| table.name.clone()); table @@ -21620,6 +22526,33 @@ fn table_output_columns(table: &TableSchema, alias: &Option) -> Vec( + item: &'a FromItem, + tables: &mut Vec>, + constraints: &mut Vec<&'a Expr>, +) -> bool { + match item { + FromItem::Table { name, alias } => { + tables.push(TableBindingRef { name, alias }); + true + } + FromItem::Join { + left, + right, + kind: JoinKind::Inner, + constraint: JoinConstraint::On(on), + } => { + flatten_inner_join_chain(left, tables, constraints) + && flatten_inner_join_chain(right, tables, constraints) + && { + constraints.push(on); + true + } + } + _ => false, + } +} + fn spatial_join_argument_orientation<'a>( left_binding: TableBindingRef<'a>, right_binding: TableBindingRef<'a>, @@ -32505,7 +33438,7 @@ mod tests { use crate::record::compression::CompressionMode; use crate::search::TrigramQueryResult; - use crate::sql::ast::FromItem; + use crate::sql::ast::{Expr, FromItem}; use 
crate::sql::parser::parse_sql_statement; use crate::storage::checksum::crc32c_parts; use crate::storage::page::InMemoryPageStore; @@ -33705,6 +34638,68 @@ mod tests { ); } + #[test] + fn indexed_table_lookup_filters_rowid_alias_without_secondary_index() { + let mut runtime = EngineRuntime::empty(1); + execute_sql( + &mut runtime, + "CREATE TABLE lookup_users (id INT64 PRIMARY KEY, name TEXT)", + ); + for id in 1..=3 { + execute_sql( + &mut runtime, + &format!("INSERT INTO lookup_users (id, name) VALUES ({id}, 'u{id}')"), + ); + } + + let lookup = runtime + .indexed_table_lookup( + "lookup_users", + &Some("u".to_string()), + "id", + &Expr::Parameter(1), + &[Value::Int64(2)], + &BTreeMap::new(), + ) + .expect("lookup should execute") + .expect("rowid alias lookup should be handled without a secondary index"); + assert_eq!(lookup.rows.len(), 1); + assert_eq!( + lookup.rows[0], + vec![Value::Int64(2), Value::Text("u2".to_string())] + ); + assert!(lookup + .columns + .iter() + .all(|column| column.table.as_deref().is_some_and(|table| table == "u"))); + + let missing = runtime + .indexed_table_lookup( + "lookup_users", + &None, + "id", + &Expr::Parameter(1), + &[Value::Int64(99)], + &BTreeMap::new(), + ) + .expect("missing lookup should execute") + .expect("rowid alias lookup should still claim the fast path"); + assert!(missing.rows.is_empty()); + + let wrong_type = runtime + .indexed_table_lookup( + "lookup_users", + &None, + "id", + &Expr::Parameter(1), + &[Value::Text("2".to_string())], + &BTreeMap::new(), + ) + .expect("typed miss should execute") + .expect("rowid alias lookup should return an empty fast-path result"); + assert!(wrong_type.rows.is_empty()); + } + #[test] fn indexed_right_join_with_right_table_probe_preserves_unmatched_right_rows() { let mut runtime = EngineRuntime::empty(1); @@ -36461,6 +37456,13 @@ mod tests { panic!("expected query"); }; + assert_eq!( + runtime + .indexed_join_grouped_count_parent_table_name(query, &[]) + .expect("analyze") + 
.expect("indexed grouped count parent table"), + "artists" + ); let result = runtime .try_execute_indexed_join_grouped_count_query(query, &[]) .expect("execute") @@ -36677,6 +37679,85 @@ mod tests { ); } + #[test] + fn view_filter_pushdown_can_prefilter_rowid_alias_join_chain() { + let mut runtime = EngineRuntime::empty(1); + execute_sql( + &mut runtime, + "CREATE TABLE artists (id INTEGER PRIMARY KEY, name TEXT NOT NULL)", + ); + execute_sql( + &mut runtime, + "CREATE TABLE albums (id INTEGER PRIMARY KEY, artist_id INTEGER NOT NULL, title TEXT)", + ); + execute_sql( + &mut runtime, + "CREATE TABLE songs (id INTEGER PRIMARY KEY, album_id INTEGER NOT NULL, title TEXT)", + ); + execute_sql( + &mut runtime, + "CREATE INDEX idx_albums_artist ON albums (artist_id)", + ); + execute_sql( + &mut runtime, + "CREATE INDEX idx_songs_album ON songs (album_id)", + ); + execute_sql( + &mut runtime, + "INSERT INTO artists (id, name) VALUES (1, 'a')", + ); + execute_sql( + &mut runtime, + "INSERT INTO artists (id, name) VALUES (2, 'b')", + ); + execute_sql( + &mut runtime, + "INSERT INTO albums (id, artist_id, title) VALUES (10, 1, 'a1')", + ); + execute_sql( + &mut runtime, + "INSERT INTO albums (id, artist_id, title) VALUES (20, 2, 'b1')", + ); + execute_sql( + &mut runtime, + "INSERT INTO songs (id, album_id, title) VALUES (100, 10, 's1')", + ); + execute_sql( + &mut runtime, + "INSERT INTO songs (id, album_id, title) VALUES (101, 10, 's2')", + ); + execute_sql( + &mut runtime, + "INSERT INTO songs (id, album_id, title) VALUES (200, 20, 's3')", + ); + + let statement = parse_sql_statement( + "SELECT a.id AS artist_id, a.name AS artist_name, al.title AS album_title, \ + s.title AS song_title \ + FROM artists a JOIN albums al ON al.artist_id = a.id \ + JOIN songs s ON s.album_id = al.id \ + WHERE a.id = $1", + ) + .expect("parse"); + let crate::sql::ast::Statement::Query(query) = &statement else { + panic!("expected query"); + }; + let crate::sql::ast::QueryBody::Select(select) = 
&query.body else { + panic!("expected select"); + }; + + let dataset = runtime + .try_indexed_prefiltered_inner_join_tree(select, &[Value::Int64(1)], &BTreeMap::new()) + .expect("execute") + .expect("rowid-filtered view join chain should stay on the prefiltered fast path"); + + assert_eq!(dataset.rows.len(), 2); + assert!(dataset + .rows + .iter() + .all(|row| row.first() == Some(&Value::Int64(1)))); + } + #[test] fn rowid_range_projection_allows_order_by_unprojected_key() { let mut runtime = EngineRuntime::empty(1); diff --git a/design/METRIC_IMPROVEMENTS_PLAN.md b/design/METRIC_IMPROVEMENTS_PLAN.md new file mode 100644 index 0000000..3b94dc3 --- /dev/null +++ b/design/METRIC_IMPROVEMENTS_PLAN.md @@ -0,0 +1,172 @@ +# Metric Improvements Plan + +Date: 2026-06-11 + +This document tracks the performance work needed to improve DecentDB against the +metrics used in the public README benchmark charts and the rust-baseline +SQLite comparison workload. + +The task is complete when the priority metrics below have improved from this +baseline without regressing durability or correctness. + +## Mission Context + +DecentDB started as a fun embedded-database project, but it has progressed into +a useful Rust-native engine with capabilities that make it more than a SQLite +clone: native types, modern language bindings, durable WAL behavior, richer +developer ergonomics, and room for engine-level features that SQLite does not +provide directly. + +The current mission is to make DecentDB performance credible on the same terms +users already use to judge SQLite: + +- DecentDB should be at least on par with SQLite for core embedded database + operations. +- Where DecentDB has architectural or feature advantages, the goal is to beat + SQLite rather than merely avoid being slow. +- Current wins in bulk insert throughput, counts, aggregates, grouped Top-N + queries, and total rust-baseline runtime must be protected. 
+- The remaining credibility gaps are the operations users expect SQLite to be + excellent at: point lookup latency, indexed/range scans, joins, and view + expansion. +- Public-facing proof matters. Improvements should show up in the README chart + metrics and in the rust-baseline SQLite comparison, not only in isolated + microbenchmarks. +- Benchmark-only tricks are not acceptable. Do not weaken durability, bypass + correctness, add SQLite comparison behavior to the engine core, or optimize + only a binding when the bottleneck is in the engine. + +## Baseline Sources + +| Source | Path / command | Role | Baseline state | +|---|---|---|---| +| Public README benchmark summary | `data/bench_summary.json` | Source for README chart metrics | Matches local `main:data/bench_summary.json` at `9826f6e387a843745958c6bfbabd979e8f90ee3d` | +| Public README chart renderers | `scripts/make_readme_chart.py`, `scripts/visualize_alternative.py` | Normalize chart values vs SQLite and render assets | SQLite baseline is `sqlite`; higher normalized score is better | +| Native chart workload | `cargo bench -p decentdb --bench embedded_compare` | Generates the public benchmark summary | 5 statistical runs per engine | +| Rust diagnostic workload | `benchmarks/rust-baseline` | Large music-library apples-to-apples DecentDB vs SQLite comparison | Fresh run in `.tmp/rust-baseline-sqlite-compare-20260611-152618/results` | + +Public chart ratios below use the same convention as the README speedup chart: +`> 1.00x` means DecentDB is faster or more efficient than SQLite. For latency +metrics this is `sqlite_latency / decentdb_latency`; for throughput it is +`decentdb_throughput / sqlite_throughput`. + +## Public README Metrics Baseline + +These are the metrics currently used by the public benchmark images. 
+ +| Priority | Metric | Workload meaning | SQLite baseline | DecentDB balanced | Balanced vs SQLite | DecentDB tuned | Tuned vs SQLite | Current status | +|---:|---|---|---:|---:|---:|---:|---:|---| +| 1 | `read_p95_ms` | p95 prepared point lookup latency | 0.002841 ms | 0.015485 ms | 0.18x | 0.001997 ms | 1.42x | Tuned wins; balanced loses | +| 2 | `range_scan_p95_ms` | p95 ordered 50-row range scan latency | 0.011001 ms | 0.625215 ms | 0.02x | 0.012359 ms | 0.89x | Tuned slightly behind | +| 3 | `join_p95_ms` | p95 prepared inner join lookup latency | 0.003222 ms | 0.028585 ms | 0.11x | 0.003350 ms | 0.96x | Tuned near parity, still behind | +| 4 | `commit_p95_ms` | p95 durable single-row auto-commit insert latency | 0.488442 ms | 0.906035 ms | 0.54x | 0.462217 ms | 1.06x | Tuned wins narrowly | +| 5 | `concurrent_read_p95_ms` | p95 point lookup latency across 4 reader threads | 0.038827 ms | 0.045370 ms | 0.86x | 0.004815 ms | 8.06x | Tuned wins strongly | +| 6 | `aggregate_p95_ms` | p95 prepared `COUNT/SUM` aggregate latency | 0.035156 ms | 0.127360 ms | 0.28x | 0.030653 ms | 1.15x | Tuned wins | +| 7 | `insert_rows_per_sec` | prepared single-row insert loop inside one explicit transaction | 2,089,870 rows/s | 2,657,466 rows/s | 1.27x | 3,251,229 rows/s | 1.56x | DecentDB wins | + +Notes: + +- The public charts currently include multiple DecentDB profiles. The tuned row + is the strongest public performance story, but balanced and low-memory rows + are still visible and must not regress. +- Storage size, WAL size, and metric standard deviations exist in + `data/bench_summary.json`, but they are not currently rendered in the README + images. + +## Rust-Baseline SQLite Comparison + +The rust-baseline workload is not the public README chart input. It is a larger +diagnostic workload that has been useful for finding engine bottlenecks in +realistic joins, views, and grouped aggregates. 
+ +Fresh all-scale comparison from +`.tmp/rust-baseline-sqlite-compare-20260611-152618/results`: + +| Scale | DecentDB total | SQLite total | SQLite / DecentDB | Winner | Current interpretation | +|---|---:|---:|---:|---|---| +| smoke | 0.052037 s | 0.085686 s | 1.65x | DecentDB | DecentDB wins total runtime | +| medium | 0.326903 s | 0.662650 s | 2.03x | DecentDB | DecentDB wins total runtime | +| full | 3.248880 s | 6.628345 s | 2.04x | DecentDB | DecentDB wins total runtime | +| huge | 24.200240 s | 33.569571 s | 1.39x | DecentDB | DecentDB wins total runtime | + +Important remaining rust-baseline losses: + +| Scale | `query_artist_by_id` SQLite / DecentDB | `query_view_first_1000` SQLite / DecentDB | `query_songs_for_artist_via_view` SQLite / DecentDB | Interpretation | +|---|---:|---:|---:|---| +| smoke | 0.36x | 0.08x | 0.04x | SQLite wins tiny lookup and view paths | +| medium | 0.44x | 0.03x | 0.05x | SQLite wins tiny lookup and view paths | +| full | 0.64x | 0.03x | 0.08x | SQLite wins tiny lookup and view paths | +| huge | 0.94x | 0.03x | 0.06x | PK lookup nearly tied; view paths still behind | + +Important rust-baseline wins to protect: + +| Scale | `seed_songs` SQLite / DecentDB | `query_count_songs` SQLite / DecentDB | `query_aggregate_durations` SQLite / DecentDB | `query_top10_artists_by_songs` SQLite / DecentDB | `query_top10_albums_by_songs` SQLite / DecentDB | +|---|---:|---:|---:|---:|---:| +| smoke | 1.89x | 1.03x | 3.82x | 7.51x | 2.45x | +| medium | 2.01x | 35.93x | 4.51x | 11.85x | 2.38x | +| full | 1.88x | 232.10x | 4.96x | 15.54x | 2.77x | +| huge | 1.15x | 1504.83x | 3.80x | 12.77x | 2.93x | + +## Recommended Priority Order + +| Rank | Priority metric / area | Public chart coverage | Rust-baseline coverage | Baseline status | Target | +|---:|---|---|---|---|---| +| 1 | Point lookup latency | `read_p95_ms` | `query_artist_by_id` | Tuned public row wins, balanced loses, rust-baseline still loses at all scales (0.36x at smoke, narrowing to 0.94x at huge) | Improve core point
lookup latency; keep tuned ahead and close rust-baseline gap | +| 2 | Range scan latency | `range_scan_p95_ms` | Partial overlap through indexed scans and view paths | Tuned public row is 0.89x vs SQLite | Bring tuned above 1.00x vs SQLite and reduce balanced gap | +| 3 | Join and view lookup latency | `join_p95_ms` | `query_view_first_1000`, `query_songs_for_artist_via_view` | Tuned public row is 0.96x; rust-baseline view paths lose strongly | Bring public join above 1.00x and reduce view-path latency materially | +| 4 | Durable commit latency | `commit_p95_ms` | Not directly represented in rust-baseline totals | Tuned public row wins narrowly at 1.06x | Protect or improve without weakening ACID guarantees | +| 5 | Concurrent read latency | `concurrent_read_p95_ms` | Not directly represented in rust-baseline | Tuned public row wins strongly | Protect; watch for reader-cache or locking regressions | +| 6 | Aggregate latency | `aggregate_p95_ms` | `query_aggregate_durations`, grouped Top-N queries | Tuned public row wins; rust-baseline wins strongly | Protect wins; optimize only if shared hot-path work helps higher priorities | +| 7 | Insert throughput | `insert_rows_per_sec` | `seed_songs` and seed loops | DecentDB wins public and rust-baseline insert paths | Protect wins; avoid trading write durability for chart gains | +| 8 | Size and memory | Stored in summary, not charted | RSS, DB size, WAL size in rust-baseline JSON | Not public-charted today | Track opportunistically; consider adding public visibility later | + +## Execution Plan + +1. Profile point lookup and indexed lookup paths. + - Use `cargo bench -p decentdb --bench embedded_compare` for `read_p95_ms`. + - Use `benchmarks/rust-baseline` for `query_artist_by_id`. + - Candidate areas: prepared plan dispatch, rowid lookup, B-tree traversal, + row materialization, and result construction. + +2. Profile range scan and join/view paths. + - Use public metrics `range_scan_p95_ms` and `join_p95_ms`. 
+ - Use rust-baseline view steps to catch larger real-query behavior. + - Candidate areas: indexed range iteration, deferred row retrieval, view + expansion, join execution, and repeated small-query planning overhead. + +3. Preserve durable write behavior. + - Any change touching WAL, commit, checkpoint, or sync behavior must protect + `commit_p95_ms` and pass crash/durability validation. + - Do not relax durability settings to improve charts. + +4. Re-run both benchmark surfaces after each meaningful optimization. + - Public chart surface: + `cargo bench -p decentdb --bench embedded_compare` + - Merge and render, only in the release benchmark lane: + `python scripts/aggregate_benchmarks.py` + `python scripts/make_readme_chart.py` + `python scripts/visualize_alternative.py` + - Diagnostic surface: + `benchmarks/rust-baseline` all scales for `--engine decentdb` and + `--engine sqlite`. + +5. Update this document after every accepted improvement. + - Record commit or worktree label. + - Record benchmark command and output directory. + - Update baseline, current, ratio, and status columns. + - Note regressions explicitly, even when the headline metric improves. + +## Completion Criteria + +This task is complete when: + +- The public README metric set has no priority regression versus this baseline. +- Tuned DecentDB is at or above SQLite on all public chart metrics, or any + remaining exception has a documented reason and a follow-up issue. +- Balanced and low-memory DecentDB profiles are not worsened by tuned-profile + improvements. +- Rust-baseline retains DecentDB total-runtime wins at all scales. +- Rust-baseline tiny lookup and view-path losses are materially reduced, with + updated numbers in this document. +- Durability semantics remain unchanged unless covered by an ADR and matching + migration/recovery validation. 
From 07eae9d800e9b279bf2e0e69af86355651761aa5 Mon Sep 17 00:00:00 2001 From: Steven Hildreth Date: Thu, 11 Jun 2026 18:01:29 -0500 Subject: [PATCH 5/9] feat(exec): optimize point lookup performance with caching and fast path execution --- crates/decentdb/src/db.rs | 241 ++++++++++++++++++++++------- crates/decentdb/src/exec/mod.rs | 228 +++++++++++++++++++++++---- design/METRIC_IMPROVEMENTS_PLAN.md | 71 ++++++++- 3 files changed, 454 insertions(+), 86 deletions(-) diff --git a/crates/decentdb/src/db.rs b/crates/decentdb/src/db.rs index 6ebdadc..54779a8 100644 --- a/crates/decentdb/src/db.rs +++ b/crates/decentdb/src/db.rs @@ -6,7 +6,7 @@ use std::collections::{BTreeMap, BTreeSet}; use std::path::{Path, PathBuf}; use std::str::FromStr; use std::sync::atomic::{AtomicBool, AtomicU64, Ordering}; -use std::sync::{Arc, Mutex, OnceLock, RwLock, RwLockWriteGuard, Weak}; +use std::sync::{Arc, Mutex, OnceLock, RwLock, RwLockReadGuard, RwLockWriteGuard, Weak}; use std::time::{Duration, SystemTime}; #[cfg(feature = "bench-internals")] @@ -21,11 +21,14 @@ use crate::catalog::{ }; use crate::config::{DbConfig, ProcessCoordinationMode, WalSyncMode}; use crate::error::{DbError, Result}; -use crate::exec::dml::{PreparedSimpleDelete, PreparedSimpleInsert, PreparedSimpleUpdate}; +use crate::exec::dml::{ + row_id_alias_column_name, PreparedSimpleDelete, PreparedSimpleInsert, PreparedSimpleUpdate, +}; use crate::exec::{ decode_paged_table_manifest_payload, read_table_payload_row_count_from_bytes, row_satisfies_expression, statement_is_read_only, BulkLoadOptions, EngineRuntime, QueryResult, - QueryRow, RuntimeIndex, SimpleRowIdProjectionRequest, TableData, + QueryRow, ResolvedSimpleRowIdProjectionRequest, RuntimeIndex, SimpleRowIdProjectionRequest, + TableData, }; use crate::metadata::{ CheckConstraintInfo, ColumnInfo, ForeignKeyInfo, HeaderInfo, IndexInfo, IndexVerification, @@ -187,8 +190,8 @@ pub struct PreparedStatement { #[derive(Clone, Debug)] struct PreparedSimpleRowIdProjection 
{ table_name: String, - projection_columns: Vec, - filter_column: String, + projection_indexes: Vec, + column_names: Vec, param_index: usize, } @@ -1475,6 +1478,31 @@ impl Db { /// Executes a single SQL statement with positional `$n` parameters. pub fn execute_with_params(&self, sql: &str, params: &[Value]) -> Result { + if let Some(trimmed) = simple_single_statement_fast_path_sql(sql) { + if let Some(result) = self.try_execute_simple_count_sql_fast_path(trimmed, params)? { + self.record_statement_trace( + trimmed, + true, + std::time::Duration::ZERO, + 0, + Ok(&result), + ); + return Ok(result); + } + if let Some(result) = + self.try_execute_simple_row_id_projection_sql_fast_path(trimmed, params)? + { + self.record_statement_trace( + trimmed, + true, + std::time::Duration::ZERO, + 0, + Ok(&result), + ); + return Ok(result); + } + } + let mut results = self.execute_batch_with_params(sql, params)?; if results.len() != 1 { return Err(DbError::sql(format!( @@ -2209,14 +2237,9 @@ impl Db { let reader = self.inner.wal.begin_reader_with_pager(&self.inner.pager)?; let snapshot_lsn = reader.snapshot_lsn(); self.refresh_engine_from_snapshot(snapshot_lsn)?; - if self.ensure_security_tables_loaded_at_snapshot(snapshot_lsn)? { + let Some(runtime) = self.runtime_read_for_fast_read_at_snapshot(snapshot_lsn)? else { return Ok(None); - } - let runtime = self - .inner - .engine - .read() - .map_err(|_| DbError::internal("engine runtime lock poisoned"))?; + }; let Some(table) = runtime.catalog.table(plan.table_name) else { return Ok(None); }; @@ -2262,14 +2285,9 @@ impl Db { let reader = self.inner.wal.begin_reader_with_pager(&self.inner.pager)?; let snapshot_lsn = reader.snapshot_lsn(); self.refresh_engine_from_snapshot(snapshot_lsn)?; - if self.ensure_security_tables_loaded_at_snapshot(snapshot_lsn)? { + let Some(runtime) = self.runtime_read_for_fast_read_at_snapshot(snapshot_lsn)? 
else { return Ok(None); - } - let runtime = self - .inner - .engine - .read() - .map_err(|_| DbError::internal("engine runtime lock poisoned"))?; + }; let result = runtime.execute_simple_row_id_projection_at_snapshot(SimpleRowIdProjectionRequest { table_name: plan.table_name, @@ -4311,35 +4329,26 @@ impl Db { let reader = self.inner.wal.begin_reader_with_pager(&self.inner.pager)?; let snapshot_lsn = reader.snapshot_lsn(); self.refresh_engine_from_snapshot(snapshot_lsn)?; - if self.ensure_security_tables_loaded_at_snapshot(snapshot_lsn)? { + let Some(runtime) = self.runtime_read_for_fast_read_at_snapshot(snapshot_lsn)? else { return Ok(None); - } - let runtime = self - .inner - .engine - .read() - .map_err(|_| DbError::internal("engine runtime lock poisoned"))?; + }; self.validate_prepared_schema_cookie( prepared, runtime.catalog.schema_cookie, runtime.temp_schema_cookie, )?; - let projection_columns = plan - .projection_columns - .iter() - .map(String::as_str) - .collect::>(); - let result = - runtime.execute_simple_row_id_projection_at_snapshot(SimpleRowIdProjectionRequest { + let result = runtime.execute_resolved_simple_row_id_projection_at_snapshot( + ResolvedSimpleRowIdProjectionRequest { table_name: plan.table_name.as_str(), - projection_columns: &projection_columns, - filter_column: plan.filter_column.as_str(), + projection_indexes: &plan.projection_indexes, + column_names: &plan.column_names, lookup_row_id: *lookup_row_id, pager: &self.inner.pager, wal: &self.inner.wal, snapshot_lsn, use_persistent_pk_index: self.inner.config.persistent_pk_index, - })?; + }, + )?; drop(runtime); drop(reader); Ok(result) @@ -5648,18 +5657,7 @@ impl Db { _ => (None, None, None), }; let simple_row_id_projection = - parse_simple_row_id_projection_sql(&prepared_sql).map(|plan| { - PreparedSimpleRowIdProjection { - table_name: plan.table_name.to_string(), - projection_columns: plan - .projection_columns - .into_iter() - .map(str::to_string) - .collect(), - filter_column: 
plan.filter_column.to_string(), - param_index: plan.param_index, - } - }); + Self::prepared_simple_row_id_projection(&prepared_sql, runtime); Ok(PreparedStatement { db: self.clone(), schema_cookie: runtime.catalog.schema_cookie, @@ -5674,6 +5672,46 @@ impl Db { }) } + fn prepared_simple_row_id_projection( + sql: &str, + runtime: &EngineRuntime, + ) -> Option { + let plan = parse_simple_row_id_projection_sql(sql)?; + if runtime.temp_table_schema(plan.table_name).is_some() + || runtime + .catalog + .views + .keys() + .any(|view_name| identifiers_equal(view_name, plan.table_name)) + { + return None; + } + let table = runtime.catalog.table(plan.table_name)?; + if !row_id_alias_column_name(table) + .is_some_and(|column_name| identifiers_equal(column_name, plan.filter_column)) + { + return None; + } + + let mut projection_indexes = Vec::with_capacity(plan.projection_columns.len()); + let mut column_names = Vec::with_capacity(plan.projection_columns.len()); + for projection_column in plan.projection_columns { + let index = table + .columns + .iter() + .position(|column| identifiers_equal(&column.name, projection_column))?; + projection_indexes.push(index); + column_names.push(projection_column.to_string()); + } + + Some(PreparedSimpleRowIdProjection { + table_name: table.name.clone(), + projection_indexes, + column_names, + param_index: plan.param_index, + }) + } + fn prepared_simple_insert( &self, sql: &str, @@ -6278,7 +6316,6 @@ impl Db { names: &[&str], snapshot_lsn: Option, ) -> Result<()> { - let filter: BTreeSet = names.iter().map(|s| s.to_string()).collect(); { let runtime = self .inner @@ -6295,6 +6332,7 @@ impl Db { return Ok(()); } } + let filter: BTreeSet = names.iter().map(|s| s.to_string()).collect(); let mut runtime = self .inner .engine @@ -6342,6 +6380,46 @@ impl Db { ] } + fn runtime_has_deferred_security_tables(runtime: &EngineRuntime) -> bool { + runtime.has_deferred_tables() + && Self::security_catalog_table_names().iter().any(|name| { + runtime + 
.deferred_table_names() + .any(|candidate| candidate.eq_ignore_ascii_case(name)) + }) + } + + fn runtime_read_for_fast_read_at_snapshot( + &self, + snapshot_lsn: u64, + ) -> Result>> { + let runtime = self + .inner + .engine + .read() + .map_err(|_| DbError::internal("engine runtime lock poisoned"))?; + if Self::runtime_has_deferred_security_tables(&runtime) { + drop(runtime); + self.ensure_tables_loaded_at_snapshot( + &Self::security_catalog_table_names(), + Some(snapshot_lsn), + )?; + let runtime = self + .inner + .engine + .read() + .map_err(|_| DbError::internal("engine runtime lock poisoned"))?; + if runtime.security_rules_active()? { + return Ok(None); + } + return Ok(Some(runtime)); + } + if runtime.security_rules_active()? { + return Ok(None); + } + Ok(Some(runtime)) + } + fn ensure_security_tables_loaded_at_snapshot(&self, snapshot_lsn: u64) -> Result { self.ensure_tables_loaded_at_snapshot( &Self::security_catalog_table_names(), @@ -13951,6 +14029,18 @@ struct SimpleRowIdProjectionSqlPlan<'a> { param_index: usize, } +fn simple_single_statement_fast_path_sql(sql: &str) -> Option<&str> { + let trimmed = sql.trim(); + if trimmed.is_empty() { + return None; + } + let trimmed = trimmed.strip_suffix(';').unwrap_or(trimmed).trim_end(); + if trimmed.is_empty() || trimmed.contains(';') { + return None; + } + Some(trimmed) +} + fn parse_simple_count_star_sql(sql: &str) -> Option> { const PREFIX: &str = "select count(*) from "; let trimmed = sql.trim(); @@ -13972,18 +14062,17 @@ fn parse_simple_row_id_projection_sql(sql: &str) -> Option Option Option { + if needle.is_empty() { + return Some(0); + } + haystack + .as_bytes() + .windows(needle.len()) + .position(|window| ascii_bytes_eq_ignore_ascii_case(window, needle.as_bytes())) +} + +fn contains_ascii_case_insensitive(haystack: &str, needle: &str) -> bool { + find_ascii_case_insensitive(haystack, needle).is_some() +} + +fn ascii_bytes_eq_ignore_ascii_case(left: &[u8], right: &[u8]) -> bool { + left.len() == 
right.len() + && left + .iter() + .zip(right.iter()) + .all(|(left, right)| left.eq_ignore_ascii_case(right)) +} + fn parse_positional_param(sql: &str) -> Option { let number = sql.strip_prefix('$')?; if number.is_empty() || !number.bytes().all(|byte| byte.is_ascii_digit()) { @@ -16272,8 +16383,9 @@ mod tests { use crate::{BulkLoadOptions, Db, QueuedWriteOptions, Value, WalSyncMode}; use super::{ - parse_simple_count_star_sql, parse_simple_row_id_projection_sql, split_sql_batch, - PreparedInsertCache, StatementCache, TempSchemaState, + parse_simple_count_star_sql, parse_simple_row_id_projection_sql, + simple_single_statement_fast_path_sql, split_sql_batch, PreparedInsertCache, + StatementCache, TempSchemaState, }; #[derive(Debug)] @@ -16518,6 +16630,17 @@ mod tests { .is_none()); } + #[test] + fn single_statement_fast_path_accepts_optional_trailing_semicolon_only() { + assert_eq!( + simple_single_statement_fast_path_sql(" SELECT id FROM artists WHERE id = $1; "), + Some("SELECT id FROM artists WHERE id = $1") + ); + assert!(simple_single_statement_fast_path_sql("").is_none()); + assert!(simple_single_statement_fast_path_sql("SELECT 1; SELECT 2").is_none()); + assert!(simple_single_statement_fast_path_sql("SELECT 1;;").is_none()); + } + #[test] fn drop_does_not_block_indefinitely() -> Result<()> { let dir = TempDir::new().expect("create temp dir"); diff --git a/crates/decentdb/src/exec/mod.rs b/crates/decentdb/src/exec/mod.rs index 5c43dbc..842cd16 100644 --- a/crates/decentdb/src/exec/mod.rs +++ b/crates/decentdb/src/exec/mod.rs @@ -103,6 +103,7 @@ const ENUM_COLUMNS_SECTION_MAGIC: &[u8; 8] = b"DDBENU01"; const FULL_TEXT_OPTIONS_SECTION_MAGIC: &[u8; 8] = b"DDBFTS01"; const SIGNED_ROW_ID_BIAS: u64 = 0x8000_0000_0000_0000; const DEFERRED_COMPRESSED_LOOKUP_CACHE_LIMIT: usize = 32; +const DEFERRED_PAGED_ROW_PAYLOAD_CACHE_LIMIT_BYTES: usize = 8 * 1024 * 1024; const DEFERRED_VIEW_LIMIT_MIN_PERSISTED_ROWS: usize = 100_000; static RANDOM_STATE: AtomicU64 = 
AtomicU64::new(0); static DEFERRED_COMPRESSED_LOOKUP_CACHE: OnceLock> = @@ -228,17 +229,43 @@ struct CachedPagedRowLocator { locator: RowLocatorV1, } +#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)] +struct CachedPagedChunkPayloadKey { + head_page_id: PageId, + logical_len: u32, + flags: u8, + checksum: u32, +} + +impl CachedPagedChunkPayloadKey { + fn new(pointer: OverflowPointer, checksum: u32) -> Self { + Self { + head_page_id: pointer.head_page_id, + logical_len: pointer.logical_len, + flags: pointer.flags, + checksum, + } + } +} + #[derive(Debug)] struct DeferredPagedRowLocatorCache { manifest_pointer: OverflowPointer, manifest_checksum: u32, - locators: BTreeMap, + locators: Int64Map, + verified_payloads: HashMap>>, } impl DeferredPagedRowLocatorCache { fn matches_state(&self, state: PersistedTableState) -> bool { self.manifest_pointer == state.pointer && self.manifest_checksum == state.checksum } + + fn verified_payload(&self, pointer: OverflowPointer, checksum: u32) -> Option<&[u8]> { + self.verified_payloads + .get(&CachedPagedChunkPayloadKey::new(pointer, checksum)) + .map(|payload| payload.as_slice()) + } } #[derive(Debug, Default)] @@ -1734,6 +1761,28 @@ pub(crate) struct SimpleRowIdProjectionRequest<'a> { pub(crate) use_persistent_pk_index: bool, } +pub(crate) struct ResolvedSimpleRowIdProjectionRequest<'a> { + pub(crate) table_name: &'a str, + pub(crate) projection_indexes: &'a [usize], + pub(crate) column_names: &'a [String], + pub(crate) lookup_row_id: i64, + pub(crate) pager: &'a PagerHandle, + pub(crate) wal: &'a WalHandle, + pub(crate) snapshot_lsn: u64, + pub(crate) use_persistent_pk_index: bool, +} + +struct ValidatedSimpleRowIdProjectionRequest<'a> { + table_schema: &'a TableSchema, + projection_indexes: &'a [usize], + column_names: Vec, + lookup_row_id: i64, + pager: &'a PagerHandle, + wal: &'a WalHandle, + snapshot_lsn: u64, + use_persistent_pk_index: bool, +} + impl EngineRuntime { #[must_use] pub(crate) fn empty(schema_cookie: u32) -> 
Self { @@ -11189,8 +11238,6 @@ impl EngineRuntime { { return Ok(None); } - let canonical_table_name = table_schema.name.as_str(); - let mut projection_indexes = Vec::with_capacity(request.projection_columns.len()); let mut column_names = Vec::with_capacity(request.projection_columns.len()); for projection_column in request.projection_columns { @@ -11205,17 +11252,75 @@ impl EngineRuntime { column_names.push((*projection_column).to_string()); } + self.execute_validated_simple_row_id_projection_at_snapshot( + ValidatedSimpleRowIdProjectionRequest { + table_schema, + projection_indexes: &projection_indexes, + column_names, + lookup_row_id: request.lookup_row_id, + pager: request.pager, + wal: request.wal, + snapshot_lsn: request.snapshot_lsn, + use_persistent_pk_index: request.use_persistent_pk_index, + }, + ) + } + + pub(crate) fn execute_resolved_simple_row_id_projection_at_snapshot( + &self, + request: ResolvedSimpleRowIdProjectionRequest<'_>, + ) -> Result> { + if self + .visible_view(request.table_name, NameResolutionScope::Session) + .is_some() + || self.visible_table_is_temporary(request.table_name) + { + return Ok(None); + } + let Some(table_schema) = self.table_schema(request.table_name) else { + return Ok(None); + }; + if !generated_columns_are_stored(table_schema) { + return Ok(None); + } + if request + .projection_indexes + .iter() + .any(|index| *index >= table_schema.columns.len()) + { + return Ok(None); + } + self.execute_validated_simple_row_id_projection_at_snapshot( + ValidatedSimpleRowIdProjectionRequest { + table_schema, + projection_indexes: request.projection_indexes, + column_names: request.column_names.to_vec(), + lookup_row_id: request.lookup_row_id, + pager: request.pager, + wal: request.wal, + snapshot_lsn: request.snapshot_lsn, + use_persistent_pk_index: request.use_persistent_pk_index, + }, + ) + } + + fn execute_validated_simple_row_id_projection_at_snapshot( + &self, + request: ValidatedSimpleRowIdProjectionRequest<'_>, + ) -> Result> { 
+ let table_schema = request.table_schema; + let canonical_table_name = table_schema.name.as_str(); if let Some(row_source) = self.visible_table_row_source(canonical_table_name) { let rows = row_source .row_by_id(request.lookup_row_id)? .map(|stored_row| { vec![project_simple_projection_values( stored_row.values(), - &projection_indexes, + request.projection_indexes, )] }) .unwrap_or_default(); - return Ok(Some(QueryResult::with_rows(column_names, rows))); + return Ok(Some(QueryResult::with_rows(request.column_names, rows))); } if !self.has_deferred_tables() @@ -11247,13 +11352,13 @@ impl EngineRuntime { paged_locator_cache, )? .map(|stored_row| { - vec![project_simple_projection_row( - &stored_row, - &projection_indexes, + vec![project_simple_projection_owned_row( + stored_row, + request.projection_indexes, )] }) .unwrap_or_default(); - Ok(Some(QueryResult::with_rows(column_names, rows))) + Ok(Some(QueryResult::with_rows(request.column_names, rows))) } fn try_execute_simple_deferred_rowid_join_projection_query( @@ -22966,6 +23071,22 @@ fn project_simple_projection_row(stored_row: &StoredRow, projection_indexes: &[u project_simple_projection_values(&stored_row.values, projection_indexes) } +fn project_simple_projection_owned_row( + stored_row: StoredRow, + projection_indexes: &[usize], +) -> QueryRow { + if projection_indexes.len() == stored_row.values.len() + && projection_indexes + .iter() + .copied() + .enumerate() + .all(|(expected, actual)| expected == actual) + { + return QueryRow::new(stored_row.values); + } + project_simple_projection_values(&stored_row.values, projection_indexes) +} + fn project_simple_projection_values(values: &[Value], projection_indexes: &[usize]) -> QueryRow { let mut projected = Vec::with_capacity(projection_indexes.len()); for index in projection_indexes { @@ -23424,7 +23545,7 @@ fn append_paged_row_locator_entries( } fn append_cached_paged_row_locators( - locators: &mut BTreeMap, + locators: &mut Int64Map, payload: &[u8], pointer: 
OverflowPointer, checksum: u32, @@ -23464,12 +23585,54 @@ fn append_cached_paged_row_locators( Ok(()) } +fn maybe_cache_verified_paged_chunk_payload( + payloads: &mut HashMap>>, + cached_payload_bytes: &mut usize, + pointer: OverflowPointer, + checksum: u32, + payload: &Arc>, +) { + let payload_len = payload.len(); + if payload_len == 0 || payload_len > DEFERRED_PAGED_ROW_PAYLOAD_CACHE_LIMIT_BYTES { + return; + } + if cached_payload_bytes.saturating_add(payload_len) + > DEFERRED_PAGED_ROW_PAYLOAD_CACHE_LIMIT_BYTES + { + return; + } + let key = CachedPagedChunkPayloadKey::new(pointer, checksum); + if payloads.contains_key(&key) { + return; + } + *cached_payload_bytes = cached_payload_bytes.saturating_add(payload_len); + payloads.insert(key, Arc::clone(payload)); +} + fn build_deferred_paged_row_locator_cache( state: PersistedTableState, chunks: &[TablePageManifestChunk], ) -> Result { - let mut locators = BTreeMap::new(); + let expected_locators = chunks + .iter() + .map(|chunk| { + chunk + .row_count + .saturating_add(chunk.overlay_payload.is_some() as usize) + }) + .sum(); + let mut locators = + Int64Map::with_capacity_and_hasher(expected_locators, Int64HashBuilder::default()); + let mut verified_payloads = HashMap::new(); + let mut cached_payload_bytes = 0usize; for chunk in chunks { + maybe_cache_verified_paged_chunk_payload( + &mut verified_payloads, + &mut cached_payload_bytes, + chunk.pointer, + chunk.checksum, + &chunk.payload, + ); append_cached_paged_row_locators( &mut locators, chunk.payload.as_slice(), @@ -23482,6 +23645,13 @@ fn build_deferred_paged_row_locator_cache( chunk.overlay_checksum, chunk.overlay_payload.as_ref(), ) { + maybe_cache_verified_paged_chunk_payload( + &mut verified_payloads, + &mut cached_payload_bytes, + overlay_pointer, + overlay_checksum, + overlay_payload, + ); append_cached_paged_row_locators( &mut locators, overlay_payload.as_slice(), @@ -23495,6 +23665,7 @@ fn build_deferred_paged_row_locator_cache( manifest_pointer: 
state.pointer, manifest_checksum: state.checksum, locators, + verified_payloads, }) } @@ -23729,21 +23900,19 @@ fn read_deferred_row_by_cached_paged_locator( store: &S, row_id: i64, cached: CachedPagedRowLocator, + verified_payload: Option<&[u8]>, ) -> Result> { - let payload = read_overflow(store, cached.pointer)?; - if crc32c_parts(&[payload.as_slice()]) != cached.checksum { - return Err(DbError::corruption("paged table chunk checksum mismatch")); - } - let start = cached.locator.byte_offset as usize; - let end = start.saturating_add(cached.locator.byte_len as usize); - let row_bytes = payload - .get(start..end) - .ok_or_else(|| DbError::corruption("paged row locator exceeded payload length"))?; - let row = Row::decode(row_bytes)?; - Ok(Some(StoredRow { - row_id, - values: row.into_values(), - })) + let owned_payload; + let payload = if let Some(payload) = verified_payload { + payload + } else { + owned_payload = read_overflow(store, cached.pointer)?; + if crc32c_parts(&[owned_payload.as_slice()]) != cached.checksum { + return Err(DbError::corruption("paged table chunk checksum mismatch")); + } + owned_payload.as_slice() + }; + decode_row_by_locator_from_payload(payload, row_id, cached.locator).map(Some) } fn read_deferred_row_by_id_from_paged_chunk( @@ -23901,7 +24070,14 @@ fn read_deferred_stored_row_by_id( .locators .get(&row_id) .copied() - .map(|cached| read_deferred_row_by_cached_paged_locator(store, row_id, cached)) + .map(|cached| { + read_deferred_row_by_cached_paged_locator( + store, + row_id, + cached, + cache.verified_payload(cached.pointer, cached.checksum), + ) + }) .unwrap_or(Ok(None)); } return match locator { diff --git a/design/METRIC_IMPROVEMENTS_PLAN.md b/design/METRIC_IMPROVEMENTS_PLAN.md index 3b94dc3..048a0c9 100644 --- a/design/METRIC_IMPROVEMENTS_PLAN.md +++ b/design/METRIC_IMPROVEMENTS_PLAN.md @@ -107,11 +107,80 @@ Important rust-baseline wins to protect: | full | 1.88x | 232.10x | 4.96x | 15.54x | 2.77x | | huge | 1.15x | 1504.83x | 
3.80x | 12.77x | 2.93x | +## 2026-06-11 Worktree Update: Point Lookup + +Implemented worktree optimizations for the point-lookup priority: + +- Cache prepared row-id projection metadata so prepared point reads do not + resolve projection columns on every execution. +- Add an `execute_with_params` single-statement fast path before batch splitting + for simple `SELECT ... WHERE rowid_alias = $n` and `COUNT(*)` queries. +- Use the existing identity-hashed `Int64Map` for deferred paged row locators. +- Retain a bounded 8 MiB per-table cache of already verified paged chunk + payloads in `DeferredPagedRowLocatorCache`, avoiding repeated overflow reads + and CRC checks for small dimension tables. +- Avoid a lowercased SQL allocation in the simple row-id parser. +- Avoid common-path security-table allocation and duplicate runtime read locks. +- Move decoded owned rows directly into `QueryRow` for identity projections. +- Split validated and resolved simple row-id execution so the unprepared fast + path does not repeat table/view/temp/generated-column validation. 
+ +Validation run: + +- `cargo fmt --check` +- `cargo check -p decentdb` +- `cargo clippy -p decentdb --all-targets --all-features -- -D warnings` +- `cargo test -p decentdb prepared_row_id_point_lookup_keeps_deferred_table_unloaded -- --nocapture` +- `cargo test -p decentdb prepared_row_id_range_uses_deferred_locator_cache -- --nocapture` +- `cargo test -p decentdb fast_path -- --nocapture` + +Rust-baseline all-scale comparison from +`.tmp/rust-baseline-point-lookup-20260611-final/results`: + +| Scale | Original DecentDB `query_artist_by_id` | Current DecentDB | Current SQLite | SQLite / DecentDB | DecentDB change vs original | Status | +|---|---:|---:|---:|---:|---:|---| +| smoke | 48.66 us | 23.98 us | 22.87 us | 0.95x | -50.7% | Large improvement; not a clean SQLite win | +| medium | 73.21 us | 35.33 us | 42.77 us | 1.21x | -51.7% | DecentDB wins this run | +| full | 89.72 us | 39.34 us | 61.94 us | 1.57x | -56.1% | DecentDB wins this run | +| huge | 87.37 us | 41.16 us | 90.31 us | 2.19x | -52.9% | DecentDB wins this run | + +Smoke repeat check from +`.tmp/rust-baseline-point-lookup-20260611-smoke-repeats-v6`: + +| Engine | Runs | Median | Q1 | Q3 | Interpretation | +|---|---:|---:|---:|---:|---| +| DecentDB | 24 | 23.83 us | 22.87 us | 26.08 us | Much improved but still behind SQLite median | +| SQLite | 24 | 20.58 us | 19.46 us | 22.53 us | Still faster on tiny fixed-overhead smoke lookup | + +Current point-lookup status: + +- The original rust-baseline DecentDB point lookup was improved by roughly + 51-56% on medium/full and 53% on huge, now beating SQLite by 1.21x, 1.57x, + and 2.19x respectively in the latest all-scale run. +- Smoke improved by roughly 51%, but median smoke still trails SQLite by about + 14% in repeated runs. The remaining gap is fixed per-query overhead rather + than row retrieval from storage. +- The public `embedded_compare` chart benchmark has not been rerun for this + worktree update yet. 
Because the prepared path was optimized, `read_p95_ms` + should be rerun before marking priority 1 complete. + +Next point-lookup follow-ups: + +- Profile the remaining fixed overhead in `begin_reader_with_pager`, + `refresh_engine_from_snapshot`, security rule checks, and result construction + before attempting a more invasive change. +- Add selective row decoding for partial projections such as public + `SELECT name FROM users WHERE id = $1`; this should help public + `read_p95_ms` more than rust-baseline `query_artist_by_id`, which projects + every `artists` column. +- Rerun `cargo bench -p decentdb --bench embedded_compare` and update the + public README metric table before declaring point lookup complete. + ## Recommended Priority Order | Rank | Priority metric / area | Public chart coverage | Rust-baseline coverage | Baseline status | Target | |---:|---|---|---|---|---| -| 1 | Point lookup latency | `read_p95_ms` | `query_artist_by_id` | Tuned public row wins, balanced loses, rust-baseline still slightly loses at all scales | Improve core point lookup latency; keep tuned ahead and close rust-baseline gap | +| 1 | Point lookup latency | `read_p95_ms` | `query_artist_by_id` | Worktree now wins rust-baseline medium/full/huge and cuts smoke roughly in half, but smoke median still trails SQLite; public chart rerun pending | Finish fixed-overhead work, rerun public benchmark, keep tuned ahead and close smoke gap | | 2 | Range scan latency | `range_scan_p95_ms` | Partial overlap through indexed scans and view paths | Tuned public row is 0.89x vs SQLite | Bring tuned above 1.00x vs SQLite and reduce balanced gap | | 3 | Join and view lookup latency | `join_p95_ms` | `query_view_first_1000`, `query_songs_for_artist_via_view` | Tuned public row is 0.96x; rust-baseline view paths lose strongly | Bring public join above 1.00x and reduce view-path latency materially | | 4 | Durable commit latency | `commit_p95_ms` | Not directly represented in rust-baseline totals | Tuned 
public row wins narrowly at 1.06x | Protect or improve without weakening ACID guarantees | From 0239e2c480e20d49c7a76d6b906926c46cc63240 Mon Sep 17 00:00:00 2001 From: Steven Hildreth Date: Fri, 12 Jun 2026 08:23:56 -0500 Subject: [PATCH 6/9] Refactor QueryResult to use Arc for columns, enhance row projection decoding, and improve FaultyVfsFile write handling - Changed QueryResult's columns from Vec<String> to Arc<[String]> for better memory efficiency. - Added a new method `with_shared_columns` to create QueryResult with shared columns. - Implemented `decode_projection_with_overflow` in Row to handle projections with overflow support. - Enhanced `decode_value_with_overflow` to decode various value types with overflow handling. - Added tests for row projection decoding to ensure correct materialization of requested columns. - Introduced batched write handling in FaultyVfsFile to honor failpoints and improve error handling. - Updated StatsVfsFile to track write statistics for batched writes. - Added a method in WalIndex to check for dirty pages since demotion. - Modified commit logic in WAL writer to conditionally demote cold versions based on dirty pages. - Updated benchmark summary and metrics documentation to reflect recent performance improvements. 
--- benchmarks/rust-baseline/README.md | 7 + crates/decentdb/src/c_api.rs | 94 + crates/decentdb/src/db.rs | 1850 ++++++++++++++++- crates/decentdb/src/exec/mod.rs | 1173 ++++++++++- crates/decentdb/src/exec/row.rs | 21 +- crates/decentdb/src/record/row.rs | 235 +++ crates/decentdb/src/vfs/faulty.rs | 79 + crates/decentdb/src/vfs/stats.rs | 18 + crates/decentdb/src/wal/index.rs | 4 + crates/decentdb/src/wal/writer.rs | 14 +- .../unit_wal_recovery_edge_cases_tests.rs | 9 +- data/bench_summary.json | 215 +- design/METRIC_IMPROVEMENTS_PLAN.md | 58 +- 13 files changed, 3441 insertions(+), 336 deletions(-) diff --git a/benchmarks/rust-baseline/README.md b/benchmarks/rust-baseline/README.md index 0fb8f0a..64bfe2d 100644 --- a/benchmarks/rust-baseline/README.md +++ b/benchmarks/rust-baseline/README.md @@ -8,6 +8,13 @@ seed plan, and query shapes through `rusqlite`. The SQLite path exists only in this benchmark crate. It does not add SQLite tests, dependencies, or comparison behavior to the DecentDB engine core. +For the current cross-benchmark performance plan, see +`../../design/METRIC_IMPROVEMENTS_PLAN.md`. The public README charts are driven +by `cargo bench -p decentdb --bench embedded_compare` and +`data/bench_summary.json`; this rust-baseline runner is the larger diagnostic +surface for music-library totals, point lookups, joins, views, and grouped +aggregates. 
+ The default DecentDB path links the `decentdb` crate directly (path-dep against `../../crates/decentdb`) and uses the engine's hot-path API: diff --git a/crates/decentdb/src/c_api.rs b/crates/decentdb/src/c_api.rs index df58ad5..03fdcbc 100644 --- a/crates/decentdb/src/c_api.rs +++ b/crates/decentdb/src/c_api.rs @@ -5346,6 +5346,100 @@ mod tests { let _ = std::fs::remove_file(path_buf.with_extension("ddb.shm")); } + #[test] + fn ffi_reader_prepared_statement_refreshes_after_external_checkpoint() { + let unique = format!( + "ffi-checkpoint-reader-refresh-{}-{}.ddb", + std::process::id(), + std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .expect("clock") + .as_nanos() + ); + let path_buf = std::env::temp_dir().join(unique); + let path = CString::new(path_buf.to_string_lossy().as_bytes()).expect("path"); + let mut setup = ptr::null_mut(); + let mut reader = ptr::null_mut(); + let mut writer = ptr::null_mut(); + let mut result = ptr::null_mut(); + + assert_eq!(ddb_db_open_or_create(path.as_ptr(), &mut setup), DDB_OK); + let create = CString::new("CREATE TABLE t (id INTEGER)").expect("create"); + assert_eq!( + ddb_db_execute(setup, create.as_ptr(), ptr::null(), 0, &mut result), + DDB_OK + ); + assert_eq!(ddb_result_free(&mut result), DDB_OK); + assert_eq!(ddb_db_begin_transaction(setup), DDB_OK); + for id in 0_i64..100_i64 { + let insert = CString::new(format!("INSERT INTO t VALUES ({id})")).expect("insert"); + assert_eq!( + ddb_db_execute(setup, insert.as_ptr(), ptr::null(), 0, &mut result), + DDB_OK + ); + assert_eq!(ddb_result_free(&mut result), DDB_OK); + } + let mut lsn = 0; + assert_eq!(ddb_db_commit_transaction(setup, &mut lsn), DDB_OK); + assert_eq!(ddb_db_free(&mut setup), DDB_OK); + + assert_eq!(ddb_db_open_or_create(path.as_ptr(), &mut reader), DDB_OK); + let select_all = CString::new("SELECT * FROM t").expect("select all"); + let mut select_stmt = ptr::null_mut(); + assert_eq!( + ddb_db_prepare(reader, select_all.as_ptr(), &mut 
select_stmt), + DDB_OK + ); + let mut seen_rows = 0; + loop { + let mut has_row = 0; + assert_eq!(ddb_stmt_step(select_stmt, &mut has_row), DDB_OK); + if has_row == 0 { + break; + } + seen_rows += 1; + } + assert_eq!(seen_rows, 100); + assert_eq!(ddb_stmt_free(&mut select_stmt), DDB_OK); + + assert_eq!(ddb_db_open_or_create(path.as_ptr(), &mut writer), DDB_OK); + assert_eq!(ddb_db_begin_transaction(writer), DDB_OK); + for id in 100_i64..200_i64 { + let insert = CString::new(format!("INSERT INTO t VALUES ({id})")).expect("insert"); + assert_eq!( + ddb_db_execute(writer, insert.as_ptr(), ptr::null(), 0, &mut result), + DDB_OK + ); + assert_eq!(ddb_result_free(&mut result), DDB_OK); + } + assert_eq!(ddb_db_commit_transaction(writer, &mut lsn), DDB_OK); + assert_eq!(ddb_db_checkpoint(writer), DDB_OK); + + let count = CString::new("SELECT COUNT(*) FROM t").expect("count"); + let mut count_stmt = ptr::null_mut(); + assert_eq!( + ddb_db_prepare(reader, count.as_ptr(), &mut count_stmt), + DDB_OK + ); + let mut has_row = 0; + assert_eq!(ddb_stmt_step(count_stmt, &mut has_row), DDB_OK); + assert_eq!(has_row, 1); + let mut value = DdbValue::default(); + assert_eq!(ddb_stmt_value_copy(count_stmt, 0, &mut value), DDB_OK); + assert_eq!(value.int64_value, 200); + assert_eq!(ddb_value_dispose(&mut value), DDB_OK); + + assert_eq!(ddb_stmt_free(&mut count_stmt), DDB_OK); + assert_eq!(ddb_db_free(&mut writer), DDB_OK); + assert_eq!(ddb_db_free(&mut reader), DDB_OK); + + let _ = std::fs::remove_file(&path_buf); + let mut wal_path = path_buf.clone(); + wal_path.as_mut_os_string().push(".wal"); + let _ = std::fs::remove_file(wal_path); + let _ = std::fs::remove_file(path_buf.with_extension("ddb.shm")); + } + #[test] fn ffi_reused_prepared_stmt_across_checkpoint_keeps_explicit_txn_commit_working() { let unique = format!( diff --git a/crates/decentdb/src/db.rs b/crates/decentdb/src/db.rs index 54779a8..bb23bee 100644 --- a/crates/decentdb/src/db.rs +++ b/crates/decentdb/src/db.rs @@ -27,8 
+27,9 @@ use crate::exec::dml::{ use crate::exec::{ decode_paged_table_manifest_payload, read_table_payload_row_count_from_bytes, row_satisfies_expression, statement_is_read_only, BulkLoadOptions, EngineRuntime, QueryResult, - QueryRow, ResolvedSimpleRowIdProjectionRequest, RuntimeIndex, SimpleRowIdProjectionRequest, - TableData, + QueryRow, ResolvedSimpleJoinProjection, ResolvedSimpleRowIdJoinProjectionRequest, + ResolvedSimpleRowIdProjectionRequest, ResolvedSimpleRowIdRangeProjectionRequest, RuntimeIndex, + SimpleJoinProjectionSide, SimpleRangeBoundValue, SimpleRowIdProjectionRequest, TableData, }; use crate::metadata::{ CheckConstraintInfo, ColumnInfo, ForeignKeyInfo, HeaderInfo, IndexInfo, IndexVerification, @@ -181,6 +182,9 @@ pub struct PreparedStatement { statement: Arc, prepared_sql: String, simple_row_id_projection: Option, + simple_row_id_range_projection: Option, + simple_row_id_join_projection: Option, + simple_scalar_filtered_aggregate: Option, prepared_insert: Option>, prepared_update: Option>, prepared_delete: Option>, @@ -191,10 +195,84 @@ pub struct PreparedStatement { struct PreparedSimpleRowIdProjection { table_name: String, projection_indexes: Vec, - column_names: Vec, + column_names: Arc<[String]>, param_index: usize, } +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +struct PreparedSimpleRangeBoundParam { + inclusive: bool, + param_index: usize, +} + +#[derive(Clone, Debug)] +struct PreparedSimpleRowIdRangeProjection { + table_name: String, + projection_indexes: Vec, + column_names: Arc<[String]>, + filter_column: String, + lower_bound: Option, + upper_bound: Option, + limit_param_index: usize, +} + +#[derive(Clone, Debug)] +struct PreparedSimpleRowIdJoinProjection { + left_table_name: String, + right_table_name: String, + left_projection_indexes: Vec, + right_projection_indexes: Vec, + projections: Vec, + column_names: Arc<[String]>, + param_index: usize, +} + +#[derive(Clone, Debug)] +struct PreparedSimpleScalarFilteredAggregate { + 
table_name: String, + param_index: usize, + cache: Arc>, +} + +#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)] +struct PreparedScalarAggregateCacheKey { + snapshot_lsn: u64, + pointer_head_page_id: u32, + pointer_logical_len: u32, + pointer_flags: u8, + checksum: u32, + row_count: usize, + param_value: i64, +} + +#[derive(Debug, Default)] +struct PreparedScalarAggregateCache { + entries: HashMap, + insertion_order: VecDeque, +} + +const PREPARED_SCALAR_AGGREGATE_CACHE_LIMIT: usize = 256; + +impl PreparedScalarAggregateCache { + fn get(&self, key: &PreparedScalarAggregateCacheKey) -> Option { + self.entries.get(key).cloned() + } + + fn insert(&mut self, key: PreparedScalarAggregateCacheKey, result: QueryResult) { + if let Some(existing) = self.entries.get_mut(&key) { + *existing = result; + return; + } + if self.entries.len() >= PREPARED_SCALAR_AGGREGATE_CACHE_LIMIT { + if let Some(evicted) = self.insertion_order.pop_front() { + self.entries.remove(&evicted); + } + } + self.insertion_order.push_back(key); + self.entries.insert(key, result); + } +} + /// Transaction-scoped prepared statement executor for repeated rows. 
/// /// This handle validates the prepared statement and resolves the insert fast @@ -2197,6 +2275,9 @@ impl Db { started_at_unix_ms: i64, result: std::result::Result<&QueryResult, &DbError>, ) { + if !self.inner.tracing.any_enabled() { + return; + } let status = match result { Ok(_) => "ok", Err(_) => "error", @@ -4341,7 +4422,122 @@ impl Db { ResolvedSimpleRowIdProjectionRequest { table_name: plan.table_name.as_str(), projection_indexes: &plan.projection_indexes, - column_names: &plan.column_names, + column_names: Arc::clone(&plan.column_names), + lookup_row_id: *lookup_row_id, + pager: &self.inner.pager, + wal: &self.inner.wal, + snapshot_lsn, + use_persistent_pk_index: self.inner.config.persistent_pk_index, + }, + )?; + drop(runtime); + drop(reader); + Ok(result) + } + + fn try_execute_prepared_simple_row_id_range_projection( + &self, + prepared: &PreparedStatement, + params: &[Value], + ) -> Result> { + if self.inner.sql_txn_active.load(Ordering::Acquire) { + return Ok(None); + } + let Some(plan) = prepared.simple_row_id_range_projection.as_ref() else { + return Ok(None); + }; + let lower_bound = if let Some(bound) = plan.lower_bound { + let Some(Value::Int64(value)) = params.get(bound.param_index) else { + return Ok(None); + }; + Some(SimpleRangeBoundValue { + inclusive: bound.inclusive, + value: Value::Int64(*value), + }) + } else { + None + }; + let upper_bound = if let Some(bound) = plan.upper_bound { + let Some(Value::Int64(value)) = params.get(bound.param_index) else { + return Ok(None); + }; + Some(SimpleRangeBoundValue { + inclusive: bound.inclusive, + value: Value::Int64(*value), + }) + } else { + None + }; + let Some(Value::Int64(limit_value)) = params.get(plan.limit_param_index) else { + return Ok(None); + }; + let limit = Some(usize::try_from((*limit_value).max(0)).unwrap_or(usize::MAX)); + + let reader = self.inner.wal.begin_reader_with_pager(&self.inner.pager)?; + let snapshot_lsn = reader.snapshot_lsn(); + 
self.refresh_engine_from_snapshot(snapshot_lsn)?; + let Some(runtime) = self.runtime_read_for_fast_read_at_snapshot(snapshot_lsn)? else { + return Ok(None); + }; + self.validate_prepared_schema_cookie( + prepared, + runtime.catalog.schema_cookie, + runtime.temp_schema_cookie, + )?; + let result = runtime.execute_resolved_simple_row_id_range_projection_at_snapshot( + ResolvedSimpleRowIdRangeProjectionRequest { + table_name: plan.table_name.as_str(), + projection_indexes: &plan.projection_indexes, + column_names: Arc::clone(&plan.column_names), + filter_column: plan.filter_column.as_str(), + lower_bound, + upper_bound, + limit, + pager: &self.inner.pager, + wal: &self.inner.wal, + snapshot_lsn, + use_persistent_pk_index: self.inner.config.persistent_pk_index, + }, + )?; + drop(runtime); + drop(reader); + Ok(result) + } + + fn try_execute_prepared_simple_row_id_join_projection( + &self, + prepared: &PreparedStatement, + params: &[Value], + ) -> Result> { + if self.inner.sql_txn_active.load(Ordering::Acquire) { + return Ok(None); + } + let Some(plan) = prepared.simple_row_id_join_projection.as_ref() else { + return Ok(None); + }; + let Some(Value::Int64(lookup_row_id)) = params.get(plan.param_index) else { + return Ok(None); + }; + + let reader = self.inner.wal.begin_reader_with_pager(&self.inner.pager)?; + let snapshot_lsn = reader.snapshot_lsn(); + self.refresh_engine_from_snapshot(snapshot_lsn)?; + let Some(runtime) = self.runtime_read_for_fast_read_at_snapshot(snapshot_lsn)? 
else { + return Ok(None); + }; + self.validate_prepared_schema_cookie( + prepared, + runtime.catalog.schema_cookie, + runtime.temp_schema_cookie, + )?; + let result = runtime.execute_resolved_simple_row_id_join_projection_at_snapshot( + ResolvedSimpleRowIdJoinProjectionRequest { + left_table_name: plan.left_table_name.as_str(), + right_table_name: plan.right_table_name.as_str(), + left_projection_indexes: &plan.left_projection_indexes, + right_projection_indexes: &plan.right_projection_indexes, + projections: &plan.projections, + column_names: Arc::clone(&plan.column_names), lookup_row_id: *lookup_row_id, pager: &self.inner.pager, wal: &self.inner.wal, @@ -4354,6 +4550,86 @@ impl Db { Ok(result) } + fn try_execute_prepared_simple_scalar_filtered_aggregate( + &self, + prepared: &PreparedStatement, + params: &[Value], + ) -> Result> { + if self.inner.sql_txn_active.load(Ordering::Acquire) { + return Ok(None); + } + let Some(plan) = prepared.simple_scalar_filtered_aggregate.as_ref() else { + return Ok(None); + }; + let Some(Value::Int64(param_value)) = params.get(plan.param_index) else { + return Ok(None); + }; + let SqlStatement::Query(query) = prepared.statement.as_ref() else { + return Ok(None); + }; + + let reader = self.inner.wal.begin_reader_with_pager(&self.inner.pager)?; + let snapshot_lsn = reader.snapshot_lsn(); + self.refresh_engine_from_snapshot(snapshot_lsn)?; + let Some(runtime) = self.runtime_read_for_fast_read_at_snapshot(snapshot_lsn)? 
else { + return Ok(None); + }; + self.validate_prepared_schema_cookie( + prepared, + runtime.catalog.schema_cookie, + runtime.temp_schema_cookie, + )?; + let has_resident_source = runtime.table_row_source(plan.table_name.as_str()).is_some(); + if !has_resident_source && !runtime.has_deferred_tables() { + return Ok(None); + } + let state = runtime.persisted_table_state(plan.table_name.as_str()); + if !has_resident_source && state.is_none() { + return Ok(None); + }; + let state = state.unwrap_or_default(); + let key = PreparedScalarAggregateCacheKey { + snapshot_lsn, + pointer_head_page_id: state.pointer.head_page_id, + pointer_logical_len: state.pointer.logical_len, + pointer_flags: state.pointer.flags, + checksum: state.checksum, + row_count: state.row_count, + param_value: *param_value, + }; + if let Some(result) = plan + .cache + .lock() + .map_err(|_| DbError::internal("prepared aggregate cache lock poisoned"))? + .get(&key) + { + drop(runtime); + drop(reader); + return Ok(Some(result)); + } + + let result = if has_resident_source { + runtime.try_execute_simple_grouped_numeric_aggregate_query(query, params)? + } else { + runtime.try_execute_simple_deferred_paged_grouped_numeric_aggregate_query( + query, + params, + &self.inner.pager, + &self.inner.wal, + snapshot_lsn, + )? + }; + if let Some(result) = result.as_ref() { + plan.cache + .lock() + .map_err(|_| DbError::internal("prepared aggregate cache lock poisoned"))? + .insert(key, result.clone()); + } + drop(runtime); + drop(reader); + Ok(result) + } + fn execute_prepared_read_statement( &self, prepared: &PreparedStatement, @@ -4364,6 +4640,21 @@ impl Db { { return Ok(result); } + if let Some(result) = + self.try_execute_prepared_simple_row_id_range_projection(prepared, params)? + { + return Ok(result); + } + if let Some(result) = + self.try_execute_prepared_simple_row_id_join_projection(prepared, params)? 
+ { + return Ok(result); + } + if let Some(result) = + self.try_execute_prepared_simple_scalar_filtered_aggregate(prepared, params)? + { + return Ok(result); + } { let runtime = self .inner @@ -4935,7 +5226,9 @@ impl Db { return Err(error); } }; - self.sync_post_commit(&mut runtime, committed_lsn)?; + if !runtime.sync_mutations.is_empty() { + self.sync_post_commit(&mut runtime, committed_lsn)?; + } self.sync_temp_state_from_runtime(&runtime)?; self.inner .last_runtime_lsn @@ -5017,7 +5310,9 @@ impl Db { return Err(error); } }; - self.sync_post_commit(&mut runtime, committed_lsn)?; + if !runtime.sync_mutations.is_empty() { + self.sync_post_commit(&mut runtime, committed_lsn)?; + } self.sync_temp_state_from_runtime(&runtime)?; self.inner .last_runtime_lsn @@ -5101,7 +5396,9 @@ impl Db { return Err(error); } }; - self.sync_post_commit(&mut runtime, committed_lsn)?; + if !runtime.sync_mutations.is_empty() { + self.sync_post_commit(&mut runtime, committed_lsn)?; + } self.sync_temp_state_from_runtime(&runtime)?; self.inner .last_runtime_lsn @@ -5109,9 +5406,13 @@ impl Db { self.inner .writer_last_commit_lsn .store(committed_lsn, Ordering::Release); + let redefer_after_write = + self.runtime_should_redefer_persisted_tables_after_write(&runtime, table_refs); drop(runtime); self.publish_reactive_commit(reactive_pending, committed_lsn); - self.redefer_persisted_tables_after_write(table_refs)?; + if redefer_after_write { + self.redefer_persisted_tables_after_write(table_refs)?; + } Ok(Some(QueryResult::with_affected_rows(affected))) } @@ -5165,7 +5466,9 @@ impl Db { return Err(error); } }; - self.sync_post_commit(&mut runtime, committed_lsn)?; + if !runtime.sync_mutations.is_empty() { + self.sync_post_commit(&mut runtime, committed_lsn)?; + } let runtime_schema_cookie = runtime.catalog.schema_cookie; if self.inner.catalog.schema_cookie()? 
!= runtime_schema_cookie { self.inner @@ -5237,7 +5540,9 @@ impl Db { return Err(error); } }; - self.sync_post_commit(&mut runtime, committed_lsn)?; + if !runtime.sync_mutations.is_empty() { + self.sync_post_commit(&mut runtime, committed_lsn)?; + } let runtime_schema_cookie = runtime.catalog.schema_cookie; if self.inner.catalog.schema_cookie()? != runtime_schema_cookie { self.inner @@ -5658,6 +5963,12 @@ impl Db { }; let simple_row_id_projection = Self::prepared_simple_row_id_projection(&prepared_sql, runtime); + let simple_row_id_range_projection = + Self::prepared_simple_row_id_range_projection(&prepared_sql, runtime); + let simple_row_id_join_projection = + Self::prepared_simple_row_id_join_projection(statement.as_ref(), runtime); + let simple_scalar_filtered_aggregate = + Self::prepared_simple_scalar_filtered_aggregate(statement.as_ref(), runtime); Ok(PreparedStatement { db: self.clone(), schema_cookie: runtime.catalog.schema_cookie, @@ -5665,6 +5976,9 @@ impl Db { statement: Arc::clone(&statement), prepared_sql: prepared_sql.clone(), simple_row_id_projection, + simple_row_id_range_projection, + simple_row_id_join_projection, + simple_scalar_filtered_aggregate, prepared_insert, prepared_update, prepared_delete, @@ -5675,40 +5989,320 @@ impl Db { fn prepared_simple_row_id_projection( sql: &str, runtime: &EngineRuntime, - ) -> Option { - let plan = parse_simple_row_id_projection_sql(sql)?; - if runtime.temp_table_schema(plan.table_name).is_some() + ) -> Option { + let plan = parse_simple_row_id_projection_sql(sql)?; + if runtime.temp_table_schema(plan.table_name).is_some() + || runtime + .catalog + .views + .keys() + .any(|view_name| identifiers_equal(view_name, plan.table_name)) + { + return None; + } + let table = runtime.catalog.table(plan.table_name)?; + if !row_id_alias_column_name(table) + .is_some_and(|column_name| identifiers_equal(column_name, plan.filter_column)) + { + return None; + } + + let mut projection_indexes = 
Vec::with_capacity(plan.projection_columns.len()); + let mut column_names = Vec::with_capacity(plan.projection_columns.len()); + for projection_column in plan.projection_columns { + let index = table + .columns + .iter() + .position(|column| identifiers_equal(&column.name, projection_column))?; + projection_indexes.push(index); + column_names.push(projection_column.to_string()); + } + + Some(PreparedSimpleRowIdProjection { + table_name: table.name.clone(), + projection_indexes, + column_names: Arc::from(column_names), + param_index: plan.param_index, + }) + } + + fn prepared_simple_row_id_range_projection( + sql: &str, + runtime: &EngineRuntime, + ) -> Option { + let plan = parse_simple_row_id_range_projection_sql(sql)?; + if runtime.temp_table_schema(plan.table_name).is_some() + || runtime + .catalog + .views + .keys() + .any(|view_name| identifiers_equal(view_name, plan.table_name)) + { + return None; + } + let table = runtime.catalog.table(plan.table_name)?; + let filter_column_index = table + .columns + .iter() + .position(|column| identifiers_equal(&column.name, plan.filter_column))?; + if !table + .primary_key_columns + .iter() + .any(|column| identifiers_equal(column, plan.filter_column)) + || table.columns[filter_column_index].column_type != ColumnType::Int64 + { + return None; + } + + let mut projection_indexes = Vec::with_capacity(plan.projection_columns.len()); + let mut column_names = Vec::with_capacity(plan.projection_columns.len()); + for projection_column in plan.projection_columns { + let index = table + .columns + .iter() + .position(|column| identifiers_equal(&column.name, projection_column))?; + projection_indexes.push(index); + column_names.push(projection_column.to_string()); + } + + Some(PreparedSimpleRowIdRangeProjection { + table_name: table.name.clone(), + projection_indexes, + column_names: Arc::from(column_names), + filter_column: table.columns[filter_column_index].name.clone(), + lower_bound: plan.lower_bound, + upper_bound: 
plan.upper_bound, + limit_param_index: plan.limit_param_index, + }) + } + + fn prepared_simple_row_id_join_projection( + statement: &SqlStatement, + runtime: &EngineRuntime, + ) -> Option { + let SqlStatement::Query(query) = statement else { + return None; + }; + if !query.ctes.is_empty() + || !query.order_by.is_empty() + || query.limit.is_some() + || query.offset.is_some() + { + return None; + } + let crate::sql::ast::QueryBody::Select(select) = &query.body else { + return None; + }; + if !select.group_by.is_empty() + || select.having.is_some() + || select.distinct + || !select.distinct_on.is_empty() + || select.from.len() != 1 + { + return None; + } + let filter = select.filter.as_ref()?; + let crate::sql::ast::FromItem::Join { + left, + right, + kind: crate::sql::ast::JoinKind::Inner, + constraint, + } = &select.from[0] + else { + return None; + }; + let crate::sql::ast::FromItem::Table { + name: left_name, + alias: left_alias, + } = &**left + else { + return None; + }; + let crate::sql::ast::FromItem::Table { + name: right_name, + alias: right_alias, + } = &**right + else { + return None; + }; + if runtime.temp_table_schema(left_name).is_some() + || runtime.temp_table_schema(right_name).is_some() + || runtime.catalog.views.keys().any(|view_name| { + identifiers_equal(view_name, left_name) || identifiers_equal(view_name, right_name) + }) + { + return None; + } + let left_schema = runtime.catalog.table(left_name)?; + let right_schema = runtime.catalog.table(right_name)?; + let left_rowid_column = row_id_alias_column_name(left_schema)?; + let right_rowid_column = row_id_alias_column_name(right_schema)?; + + let (join_a, join_b) = prepared_join_column_equality(constraint)?; + let join_a_side = + prepared_join_column_side(join_a.0, left_name, left_alias, right_name, right_alias)?; + let join_b_side = + prepared_join_column_side(join_b.0, left_name, left_alias, right_name, right_alias)?; + let (left_join_column, right_join_column) = match (join_a_side, join_b_side) { 
+ (SimpleJoinProjectionSide::Left, SimpleJoinProjectionSide::Right) => { + (join_a.1, join_b.1) + } + (SimpleJoinProjectionSide::Right, SimpleJoinProjectionSide::Left) => { + (join_b.1, join_a.1) + } + _ => return None, + }; + if !identifiers_equal(left_join_column, left_rowid_column) + || !identifiers_equal(right_join_column, right_rowid_column) + { + return None; + } + + let (filter_table, filter_column, param_index) = prepared_join_filter_param(filter)?; + let filter_side = prepared_join_column_side( + filter_table, + left_name, + left_alias, + right_name, + right_alias, + )?; + match filter_side { + SimpleJoinProjectionSide::Left + if !identifiers_equal(filter_column, left_rowid_column) => + { + return None; + } + SimpleJoinProjectionSide::Right + if !identifiers_equal(filter_column, right_rowid_column) => + { + return None; + } + _ => {} + } + let mut projections = Vec::with_capacity(select.projection.len()); + let mut left_projection_indexes = Vec::new(); + let mut right_projection_indexes = Vec::new(); + let mut column_names = Vec::with_capacity(select.projection.len()); + for item in &select.projection { + let crate::sql::ast::SelectItem::Expr { expr, alias } = item else { + return None; + }; + let crate::sql::ast::Expr::Column { table, column } = expr else { + return None; + }; + let side = prepared_join_column_side( + table.as_deref(), + left_name, + left_alias, + right_name, + right_alias, + )?; + let schema = match side { + SimpleJoinProjectionSide::Left => left_schema, + SimpleJoinProjectionSide::Right => right_schema, + }; + let index = schema + .columns + .iter() + .position(|candidate| identifiers_equal(&candidate.name, column))?; + let projected_index = match side { + SimpleJoinProjectionSide::Left => { + push_prepared_join_projection_index(&mut left_projection_indexes, index) + } + SimpleJoinProjectionSide::Right => { + push_prepared_join_projection_index(&mut right_projection_indexes, index) + } + }; + 
projections.push(ResolvedSimpleJoinProjection { + side, + index: projected_index, + }); + column_names.push(alias.clone().unwrap_or_else(|| column.clone())); + } + + Some(PreparedSimpleRowIdJoinProjection { + left_table_name: left_schema.name.clone(), + right_table_name: right_schema.name.clone(), + left_projection_indexes, + right_projection_indexes, + projections, + column_names: Arc::from(column_names), + param_index, + }) + } + + fn prepared_simple_scalar_filtered_aggregate( + statement: &SqlStatement, + runtime: &EngineRuntime, + ) -> Option { + let SqlStatement::Query(query) = statement else { + return None; + }; + if !query.ctes.is_empty() + || !query.order_by.is_empty() + || query.limit.is_some() + || query.offset.is_some() + { + return None; + } + let crate::sql::ast::QueryBody::Select(select) = &query.body else { + return None; + }; + if select.distinct + || !select.distinct_on.is_empty() + || !select.group_by.is_empty() + || select.having.is_some() + || select.from.len() != 1 + || select.projection.len() != 2 + { + return None; + } + let crate::sql::ast::FromItem::Table { name, alias } = &select.from[0] else { + return None; + }; + if runtime.temp_table_schema(name).is_some() || runtime .catalog .views .keys() - .any(|view_name| identifiers_equal(view_name, plan.table_name)) + .any(|view_name| identifiers_equal(view_name, name)) { return None; } - let table = runtime.catalog.table(plan.table_name)?; - if !row_id_alias_column_name(table) - .is_some_and(|column_name| identifiers_equal(column_name, plan.filter_column)) - { + let table = runtime.catalog.table(name)?; + if !prepared_table_generated_columns_are_stored(table) { return None; } - - let mut projection_indexes = Vec::with_capacity(plan.projection_columns.len()); - let mut column_names = Vec::with_capacity(plan.projection_columns.len()); - for projection_column in plan.projection_columns { - let index = table - .columns - .iter() - .position(|column| identifiers_equal(&column.name, 
projection_column))?; - projection_indexes.push(index); - column_names.push(projection_column.to_string()); + let param_index = prepared_scalar_filter_param(select.filter.as_ref()?, name, alias)?; + let mut saw_count = false; + let mut saw_sum = false; + for item in &select.projection { + let crate::sql::ast::SelectItem::Expr { expr, .. } = item else { + return None; + }; + if prepared_scalar_count_star(expr) { + saw_count = true; + continue; + } + if let Some(sum_column) = prepared_scalar_sum_column(expr, name, alias) { + if table + .columns + .iter() + .any(|column| identifiers_equal(&column.name, sum_column)) + { + saw_sum = true; + continue; + } + } + return None; } - - Some(PreparedSimpleRowIdProjection { + if !saw_count || !saw_sum { + return None; + } + Some(PreparedSimpleScalarFilteredAggregate { table_name: table.name.clone(), - projection_indexes, - column_names, - param_index: plan.param_index, + param_index, + cache: Arc::new(Mutex::new(PreparedScalarAggregateCache::default())), }) } @@ -6211,14 +6805,17 @@ impl Db { .inner .last_seen_checkpoint_epoch .load(Ordering::Acquire); + let writer_last_commit_lsn = self.inner.writer_last_commit_lsn.load(Ordering::Acquire); if latest_lsn <= last_runtime_lsn && latest_checkpoint_epoch == last_seen_checkpoint_epoch { return Ok(()); } + let mut checkpoint_lsn_after_refresh = None; if latest_checkpoint_epoch != last_seen_checkpoint_epoch { let cached_header = self.inner.pager.header_snapshot()?; let on_disk_header = self.inner.pager.header_from_disk()?; + checkpoint_lsn_after_refresh = Some(on_disk_header.last_checkpoint_lsn); if on_disk_header.last_checkpoint_lsn != cached_header.last_checkpoint_lsn { self.inner.pager.refresh_from_disk(on_disk_header)?; } @@ -6227,10 +6824,12 @@ impl Db { .store(latest_checkpoint_epoch, Ordering::Release); } - let writer_last_commit_lsn = self.inner.writer_last_commit_lsn.load(Ordering::Acquire); if last_runtime_lsn > 0 && writer_last_commit_lsn > 0 + && last_runtime_lsn >= 
writer_last_commit_lsn && latest_lsn <= writer_last_commit_lsn + && checkpoint_lsn_after_refresh + .is_none_or(|checkpoint_lsn| checkpoint_lsn <= last_runtime_lsn) { // A checkpoint can legally fold this handle's last committed WAL // history back into the database file and reset the live WAL end @@ -6593,6 +7192,14 @@ impl Db { } fn redefer_persisted_tables(&self, names: &[&str]) -> Result<()> { + self.redefer_persisted_tables_inner(names, true) + } + + fn redefer_persisted_tables_inner( + &self, + names: &[&str], + release_heap_after_drop: bool, + ) -> Result<()> { if !self.inner.config.defer_table_materialization || names.is_empty() { return Ok(()); } @@ -6603,7 +7210,9 @@ impl Db { .map_err(|_| DbError::internal("engine runtime lock poisoned"))?; runtime.redefer_persisted_tables(names); drop(runtime); - self.release_freed_heap_after_paged_row_source_drop(); + if release_heap_after_drop { + self.release_freed_heap_after_paged_row_source_drop(); + } Ok(()) } @@ -6613,9 +7222,18 @@ impl Db { && !self.inner.config.retain_paged_row_sources_after_commit } + fn runtime_should_redefer_persisted_tables_after_write( + &self, + runtime: &EngineRuntime, + names: &[&str], + ) -> bool { + self.should_redefer_paged_row_sources_after_write() + && runtime.has_redeferable_persisted_tables(names) + } + fn redefer_persisted_tables_after_write(&self, names: &[&str]) -> Result<()> { if self.should_redefer_paged_row_sources_after_write() { - self.redefer_persisted_tables(names) + self.redefer_persisted_tables_inner(names, false) } else { Ok(()) } @@ -12551,7 +13169,10 @@ impl Db { } fn publish_reactive_commit(&self, pending: Option, committed_lsn: u64) { - if let (Some(pending), Some(hub)) = (pending, self.reactive_hub_if_available()) { + let Some(pending) = pending else { + return; + }; + if let Some(hub) = self.reactive_hub_if_available() { hub.publish(pending, committed_lsn); } } @@ -14029,6 +14650,16 @@ struct SimpleRowIdProjectionSqlPlan<'a> { param_index: usize, } 
+#[derive(Clone, Debug, Eq, PartialEq)] +struct SimpleRowIdRangeProjectionSqlPlan<'a> { + table_name: &'a str, + projection_columns: Vec<&'a str>, + filter_column: &'a str, + lower_bound: Option, + upper_bound: Option, + limit_param_index: usize, +} + fn simple_single_statement_fast_path_sql(sql: &str) -> Option<&str> { let trimmed = sql.trim(); if trimmed.is_empty() { @@ -14054,62 +14685,473 @@ fn parse_simple_count_star_sql(sql: &str) -> Option> { if !is_simple_sql_identifier(table_name) { return None; } - Some(SimpleCountSqlPlan { table_name }) + Some(SimpleCountSqlPlan { table_name }) +} + +fn parse_simple_row_id_projection_sql(sql: &str) -> Option> { + let trimmed = sql.trim(); + if !trimmed.is_ascii() { + return None; + } + if trimmed.len() <= 7 || !trimmed[..7].eq_ignore_ascii_case("select ") { + return None; + } + let from_index = find_ascii_case_insensitive(trimmed, " from ")?; + let where_marker = " where "; + let where_index = find_ascii_case_insensitive(&trimmed[from_index + 6..], where_marker) + .map(|index| from_index + 6 + index)?; + let filter_tail = &trimmed[where_index + where_marker.len()..]; + if contains_ascii_case_insensitive(filter_tail, " order ") + || contains_ascii_case_insensitive(filter_tail, " limit ") + || contains_ascii_case_insensitive(filter_tail, " group ") + { + return None; + } + + let projection_sql = trimmed[6..from_index].trim(); + let table_name = trimmed[from_index + 6..where_index].trim(); + let filter_sql = trimmed[where_index + where_marker.len()..].trim(); + if projection_sql.is_empty() || !is_simple_sql_identifier(table_name) { + return None; + } + let mut projection_columns = Vec::new(); + for column in projection_sql.split(',') { + let column = column.trim(); + if !is_simple_sql_identifier(column) { + return None; + } + projection_columns.push(column); + } + let (left, right) = filter_sql.split_once('=')?; + let left = left.trim(); + let right = right.trim(); + let (filter_column, param_index) = if let 
Some(param_index) = parse_positional_param(right) { + (left, param_index) + } else if let Some(param_index) = parse_positional_param(left) { + (right, param_index) + } else { + return None; + }; + if !is_simple_sql_identifier(filter_column) { + return None; + } + Some(SimpleRowIdProjectionSqlPlan { + table_name, + projection_columns, + filter_column, + param_index, + }) +} + +fn parse_simple_row_id_range_projection_sql( + sql: &str, +) -> Option> { + let trimmed = sql.trim(); + if !trimmed.is_ascii() { + return None; + } + if trimmed.len() <= 7 || !trimmed[..7].eq_ignore_ascii_case("select ") { + return None; + } + let from_index = find_ascii_case_insensitive(trimmed, " from ")?; + let where_marker = " where "; + let where_index = find_ascii_case_insensitive(&trimmed[from_index + 6..], where_marker) + .map(|index| from_index + 6 + index)?; + let projection_sql = trimmed[6..from_index].trim(); + let table_name = trimmed[from_index + 6..where_index].trim(); + if projection_sql.is_empty() || !is_simple_sql_identifier(table_name) { + return None; + } + + let filter_tail = trimmed[where_index + where_marker.len()..].trim(); + let limit_marker = " limit "; + let limit_index = find_ascii_case_insensitive(filter_tail, limit_marker)?; + let before_limit = filter_tail[..limit_index].trim(); + let limit_sql = filter_tail[limit_index + limit_marker.len()..].trim(); + if limit_sql.is_empty() + || contains_ascii_case_insensitive(limit_sql, " offset ") + || limit_sql.split_ascii_whitespace().count() != 1 + { + return None; + } + let limit_param_index = parse_positional_param(limit_sql)?; + + let order_marker = " order by "; + let (filter_sql, order_column) = + if let Some(order_index) = find_ascii_case_insensitive(before_limit, order_marker) { + let filter_sql = before_limit[..order_index].trim(); + let order_sql = before_limit[order_index + order_marker.len()..].trim(); + let mut order_parts = order_sql.split_ascii_whitespace(); + let order_column = order_parts.next()?; + if 
!is_simple_sql_identifier(order_column) { + return None; + } + match order_parts.next() { + None => {} + Some(direction) if direction.eq_ignore_ascii_case("asc") => {} + _ => return None, + } + if order_parts.next().is_some() { + return None; + } + (filter_sql, Some(order_column)) + } else { + (before_limit, None) + }; + if filter_sql.is_empty() + || contains_ascii_case_insensitive(filter_sql, " group ") + || contains_ascii_case_insensitive(filter_sql, " having ") + { + return None; + } + + let mut projection_columns = Vec::new(); + for column in projection_sql.split(',') { + let column = column.trim(); + if !is_simple_sql_identifier(column) { + return None; + } + projection_columns.push(column); + } + + let mut filter_column = None; + let mut lower_bound = None; + let mut upper_bound = None; + let mut remaining = filter_sql.trim(); + loop { + let (term, rest) = if let Some(and_index) = find_ascii_case_insensitive(remaining, " and ") + { + ( + remaining[..and_index].trim(), + Some(remaining[and_index + 5..].trim()), + ) + } else { + (remaining.trim(), None) + }; + let (term_column, bound_kind, param_index) = parse_simple_range_param_term(term)?; + if let Some(existing_column) = filter_column { + if !identifiers_equal(existing_column, term_column) { + return None; + } + } else { + filter_column = Some(term_column); + } + let bound = PreparedSimpleRangeBoundParam { + inclusive: bound_kind.inclusive(), + param_index, + }; + match bound_kind { + SimpleRangeParamBoundKind::Lower(_) => { + if lower_bound.replace(bound).is_some() { + return None; + } + } + SimpleRangeParamBoundKind::Upper(_) => { + if upper_bound.replace(bound).is_some() { + return None; + } + } + } + let Some(rest) = rest else { + break; + }; + if rest.is_empty() { + return None; + } + remaining = rest; + } + let filter_column = filter_column?; + if lower_bound.is_none() && upper_bound.is_none() { + return None; + } + if order_column.is_some_and(|column| !identifiers_equal(column, filter_column)) { + 
return None; + } + + Some(SimpleRowIdRangeProjectionSqlPlan { + table_name, + projection_columns, + filter_column, + lower_bound, + upper_bound, + limit_param_index, + }) +} + +type PreparedJoinColumnRef<'a> = (Option<&'a str>, &'a str); +type PreparedJoinColumnEquality<'a> = (PreparedJoinColumnRef<'a>, PreparedJoinColumnRef<'a>); + +fn prepared_join_column_equality( + constraint: &crate::sql::ast::JoinConstraint, +) -> Option> { + let crate::sql::ast::JoinConstraint::On(expr) = constraint else { + return None; + }; + let crate::sql::ast::Expr::Binary { + left, + op: crate::sql::ast::BinaryOp::Eq, + right, + } = expr + else { + return None; + }; + let crate::sql::ast::Expr::Column { + table: left_table, + column: left_column, + } = left.as_ref() + else { + return None; + }; + let crate::sql::ast::Expr::Column { + table: right_table, + column: right_column, + } = right.as_ref() + else { + return None; + }; + Some(( + (left_table.as_deref(), left_column.as_str()), + (right_table.as_deref(), right_column.as_str()), + )) +} + +fn prepared_join_filter_param( + filter: &crate::sql::ast::Expr, +) -> Option<(Option<&str>, &str, usize)> { + let crate::sql::ast::Expr::Binary { + left, + op: crate::sql::ast::BinaryOp::Eq, + right, + } = filter + else { + return None; + }; + if let crate::sql::ast::Expr::Column { table, column } = left.as_ref() { + let crate::sql::ast::Expr::Parameter(param_index) = right.as_ref() else { + return None; + }; + return Some(( + table.as_deref(), + column.as_str(), + param_index.checked_sub(1)?, + )); + } + if let crate::sql::ast::Expr::Column { table, column } = right.as_ref() { + let crate::sql::ast::Expr::Parameter(param_index) = left.as_ref() else { + return None; + }; + return Some(( + table.as_deref(), + column.as_str(), + param_index.checked_sub(1)?, + )); + } + None +} + +fn prepared_join_column_side( + table: Option<&str>, + left_name: &str, + left_alias: &Option, + right_name: &str, + right_alias: &Option, +) -> Option { + let table = 
table?; + let matches_left = identifiers_equal(table, left_name) + || left_alias + .as_deref() + .is_some_and(|alias| identifiers_equal(table, alias)); + let matches_right = identifiers_equal(table, right_name) + || right_alias + .as_deref() + .is_some_and(|alias| identifiers_equal(table, alias)); + match (matches_left, matches_right) { + (true, false) => Some(SimpleJoinProjectionSide::Left), + (false, true) => Some(SimpleJoinProjectionSide::Right), + _ => None, + } +} + +fn push_prepared_join_projection_index(indexes: &mut Vec, index: usize) -> usize { + if let Some(position) = indexes.iter().position(|candidate| *candidate == index) { + position + } else { + indexes.push(index); + indexes.len() - 1 + } +} + +fn prepared_table_generated_columns_are_stored(table: &TableSchema) -> bool { + table + .columns + .iter() + .all(|column| column.generated_sql.is_none() || column.generated_stored) +} + +fn prepared_scalar_count_star(expr: &crate::sql::ast::Expr) -> bool { + let crate::sql::ast::Expr::Aggregate { + name, + args, + distinct, + star, + order_by, + within_group, + } = expr + else { + return false; + }; + name.eq_ignore_ascii_case("count") + && args.is_empty() + && !*distinct + && *star + && order_by.is_empty() + && !*within_group +} + +fn prepared_scalar_sum_column<'a>( + expr: &'a crate::sql::ast::Expr, + table_name: &str, + alias: &Option, +) -> Option<&'a str> { + let crate::sql::ast::Expr::Aggregate { + name, + args, + distinct, + star, + order_by, + within_group, + } = expr + else { + return None; + }; + if !name.eq_ignore_ascii_case("sum") + || *distinct + || *star + || !order_by.is_empty() + || *within_group + || args.len() != 1 + { + return None; + } + let crate::sql::ast::Expr::Column { table, column } = &args[0] else { + return None; + }; + if prepared_scalar_column_matches_table(table.as_deref(), table_name, alias) { + Some(column.as_str()) + } else { + None + } } -fn parse_simple_row_id_projection_sql(sql: &str) -> Option> { - let trimmed = 
sql.trim(); - if !trimmed.is_ascii() { +fn prepared_scalar_filter_param( + filter: &crate::sql::ast::Expr, + table_name: &str, + alias: &Option, +) -> Option { + let crate::sql::ast::Expr::Binary { + left, + op: crate::sql::ast::BinaryOp::Eq, + right, + } = filter + else { return None; + }; + if let crate::sql::ast::Expr::Column { table, .. } = left.as_ref() { + if prepared_scalar_column_matches_table(table.as_deref(), table_name, alias) { + let crate::sql::ast::Expr::Parameter(param_index) = right.as_ref() else { + return None; + }; + return param_index.checked_sub(1); + } } - if trimmed.len() <= 7 || !trimmed[..7].eq_ignore_ascii_case("select ") { - return None; + if let crate::sql::ast::Expr::Column { table, .. } = right.as_ref() { + if prepared_scalar_column_matches_table(table.as_deref(), table_name, alias) { + let crate::sql::ast::Expr::Parameter(param_index) = left.as_ref() else { + return None; + }; + return param_index.checked_sub(1); + } } - let from_index = find_ascii_case_insensitive(trimmed, " from ")?; - let where_marker = " where "; - let where_index = find_ascii_case_insensitive(&trimmed[from_index + 6..], where_marker) - .map(|index| from_index + 6 + index)?; - let filter_tail = &trimmed[where_index + where_marker.len()..]; - if contains_ascii_case_insensitive(filter_tail, " order ") - || contains_ascii_case_insensitive(filter_tail, " limit ") - || contains_ascii_case_insensitive(filter_tail, " group ") - { - return None; + None +} + +fn prepared_scalar_column_matches_table( + table: Option<&str>, + table_name: &str, + alias: &Option, +) -> bool { + table.is_none_or(|qualifier| { + identifiers_equal(qualifier, table_name) + || alias + .as_deref() + .is_some_and(|alias| identifiers_equal(qualifier, alias)) + }) +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +enum SimpleRangeParamBoundKind { + Lower(bool), + Upper(bool), +} + +impl SimpleRangeParamBoundKind { + fn inclusive(self) -> bool { + match self { + Self::Lower(inclusive) | 
Self::Upper(inclusive) => inclusive, + } } +} - let projection_sql = trimmed[6..from_index].trim(); - let table_name = trimmed[from_index + 6..where_index].trim(); - let filter_sql = trimmed[where_index + where_marker.len()..].trim(); - if projection_sql.is_empty() || !is_simple_sql_identifier(table_name) { - return None; +fn parse_simple_range_param_term(term: &str) -> Option<(&str, SimpleRangeParamBoundKind, usize)> { + let (left, op, right) = split_simple_range_operator(term)?; + if is_simple_sql_identifier(left) { + let param_index = parse_positional_param(right)?; + let bound = match op { + SimpleRangeOperator::Gt => SimpleRangeParamBoundKind::Lower(false), + SimpleRangeOperator::GtEq => SimpleRangeParamBoundKind::Lower(true), + SimpleRangeOperator::Lt => SimpleRangeParamBoundKind::Upper(false), + SimpleRangeOperator::LtEq => SimpleRangeParamBoundKind::Upper(true), + }; + return Some((left, bound, param_index)); } - let mut projection_columns = Vec::new(); - for column in projection_sql.split(',') { - let column = column.trim(); - if !is_simple_sql_identifier(column) { - return None; - } - projection_columns.push(column); + if is_simple_sql_identifier(right) { + let param_index = parse_positional_param(left)?; + let bound = match op { + SimpleRangeOperator::Gt => SimpleRangeParamBoundKind::Upper(false), + SimpleRangeOperator::GtEq => SimpleRangeParamBoundKind::Upper(true), + SimpleRangeOperator::Lt => SimpleRangeParamBoundKind::Lower(false), + SimpleRangeOperator::LtEq => SimpleRangeParamBoundKind::Lower(true), + }; + return Some((right, bound, param_index)); } - let (left, right) = filter_sql.split_once('=')?; - let left = left.trim(); - let right = right.trim(); - let (filter_column, param_index) = if let Some(param_index) = parse_positional_param(right) { - (left, param_index) - } else if let Some(param_index) = parse_positional_param(left) { - (right, param_index) - } else { - return None; - }; - if !is_simple_sql_identifier(filter_column) { - return None; 
+ None +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +enum SimpleRangeOperator { + Gt, + GtEq, + Lt, + LtEq, +} + +fn split_simple_range_operator(term: &str) -> Option<(&str, SimpleRangeOperator, &str)> { + for (token, op) in [ + (">=", SimpleRangeOperator::GtEq), + ("<=", SimpleRangeOperator::LtEq), + (">", SimpleRangeOperator::Gt), + ("<", SimpleRangeOperator::Lt), + ] { + if let Some(index) = term.find(token) { + let left = term[..index].trim(); + let right = term[index + token.len()..].trim(); + if left.is_empty() || right.is_empty() { + return None; + } + return Some((left, op, right)); + } } - Some(SimpleRowIdProjectionSqlPlan { - table_name, - projection_columns, - filter_column, - param_index, - }) + None } fn find_ascii_case_insensitive(haystack: &str, needle: &str) -> Option { @@ -16384,8 +17426,8 @@ mod tests { use super::{ parse_simple_count_star_sql, parse_simple_row_id_projection_sql, - simple_single_statement_fast_path_sql, split_sql_batch, PreparedInsertCache, - StatementCache, TempSchemaState, + parse_simple_row_id_range_projection_sql, simple_single_statement_fast_path_sql, + split_sql_batch, PreparedInsertCache, StatementCache, TempSchemaState, }; #[derive(Debug)] @@ -16630,6 +17672,51 @@ mod tests { .is_none()); } + #[test] + fn simple_row_id_range_projection_sql_parser_extracts_bounds_and_limit() { + let plan = parse_simple_row_id_range_projection_sql( + "SELECT name FROM users WHERE id >= $1 AND id < $2 ORDER BY id LIMIT $3", + ) + .expect("simple rowid range projection"); + assert_eq!(plan.table_name, "users"); + assert_eq!(plan.projection_columns, vec!["name"]); + assert_eq!(plan.filter_column, "id"); + assert_eq!( + plan.lower_bound, + Some(super::PreparedSimpleRangeBoundParam { + inclusive: true, + param_index: 0 + }) + ); + assert_eq!( + plan.upper_bound, + Some(super::PreparedSimpleRangeBoundParam { + inclusive: false, + param_index: 1 + }) + ); + assert_eq!(plan.limit_param_index, 2); + + let reversed = 
parse_simple_row_id_range_projection_sql( + "SELECT name FROM users WHERE $1 <= id AND $2 > id ORDER BY id ASC LIMIT $3", + ) + .expect("reversed range predicates"); + assert_eq!(reversed.lower_bound, plan.lower_bound); + assert_eq!(reversed.upper_bound, plan.upper_bound); + assert!(parse_simple_row_id_range_projection_sql( + "SELECT name FROM users WHERE id >= $1 AND id < $2 ORDER BY id DESC LIMIT $3" + ) + .is_none()); + assert!(parse_simple_row_id_range_projection_sql( + "SELECT name FROM users WHERE id >= $1 AND id < $2 ORDER BY name LIMIT $3" + ) + .is_none()); + assert!(parse_simple_row_id_range_projection_sql( + "SELECT upper(name) FROM users WHERE id >= $1 AND id < $2 ORDER BY id LIMIT $3" + ) + .is_none()); + } + #[test] fn single_statement_fast_path_accepts_optional_trailing_semicolon_only() { assert_eq!( @@ -18250,8 +19337,23 @@ mod tests { .expect("create table"); db.execute("CREATE INDEX docs_name_idx ON docs(name)") .expect("create index"); - db.execute("INSERT INTO docs VALUES (1, 'old'), (2, 'stable')") - .expect("insert rows"); + let mut txn = db.transaction().expect("begin seed txn"); + let insert = txn + .prepare("INSERT INTO docs VALUES ($1, $2)") + .expect("prepare seed insert"); + let large_name = "x".repeat(2048); + for i in 0_i64..40_i64 { + insert + .execute_in( + &mut txn, + &[ + Value::Int64(i + 1), + Value::Text(format!("name-{i}-{large_name}")), + ], + ) + .expect("insert row"); + } + txn.commit().expect("commit seed rows"); let mut stale_runtime = db .runtime_for_metadata_inspection() .expect("runtime for stale setup"); @@ -18707,7 +19809,11 @@ mod tests { .expect("begin reader"); let older_snapshot_lsn = reader.snapshot_lsn(); - db.execute("INSERT INTO t VALUES (1, 'newer')") + let insert = db + .prepare("INSERT INTO t VALUES ($1, $2)") + .expect("prepare newer row insert"); + insert + .execute(&[Value::Int64(1), Value::Text("x".repeat(70_000))]) .expect("insert newer row"); let newer_snapshot_lsn = db.inner.wal.latest_snapshot(); 
assert!( @@ -18778,13 +19884,14 @@ mod tests { let insert = txn .prepare("INSERT INTO bench VALUES ($1, $2, $3)") .expect("prepare insert"); + let large_body = "x".repeat(2048); for i in 0_i64..128_i64 { insert .execute_in( &mut txn, &[ Value::Int64(i), - Value::Text(format!("value_{i}")), + Value::Text(format!("value_{i}_{large_body}")), Value::Float64(i as f64), ], ) @@ -18825,7 +19932,7 @@ mod tests { result.rows()[0].values(), &[ Value::Int64(42), - Value::Text("value_42".to_string()), + Value::Text(format!("value_42_{large_body}")), Value::Float64(42.0), ] ); @@ -19929,18 +21036,169 @@ mod tests { &db.execute("SELECT COUNT(*) FROM docs") .expect("count docs rows") ), - 95 + 95 + ); + } + + #[test] + fn paged_row_storage_prepared_insert_after_reopen_preserves_untouched_chunks() { + let tempdir = TempDir::new().expect("tempdir"); + let path = tempdir + .path() + .join("paged-row-storage-insert-after-reopen.ddb"); + let config = DbConfig { + persistent_pk_index: true, + paged_row_storage: true, + ..DbConfig::default() + }; + + { + let db = Db::open_or_create(&path, config.clone()).expect("open db"); + db.execute("CREATE TABLE docs (id INTEGER PRIMARY KEY, n INTEGER, body TEXT)") + .expect("create docs table"); + let large_body = "x".repeat(2048); + let mut txn = db.transaction().expect("begin txn"); + let insert = txn + .prepare("INSERT INTO docs VALUES ($1, $2, $3)") + .expect("prepare insert"); + for i in 0_i64..96_i64 { + insert + .execute_in( + &mut txn, + &[ + Value::Int64(i + 1), + Value::Int64(i), + Value::Text(large_body.clone()), + ], + ) + .expect("insert row"); + } + txn.commit().expect("commit rows"); + db.checkpoint().expect("checkpoint"); + } + + let untouched_chunk_pointers = { + let db = Db::open_or_create(&path, config.clone()).expect("reopen db"); + let json_open = db.inspect_storage_state_json().expect("json at reopen"); + assert!( + json_open.contains("\"deferred_table_count\":1"), + "expected paged-backed table to stay deferred at reopen, 
got: {json_open}" + ); + + let untouched_chunk_pointers = { + let runtime_before = db.inner.engine.read().expect("engine runtime lock"); + let docs_before = runtime_before + .persisted_tables + .get("docs") + .expect("persisted docs before insert"); + let page_store = PagerReadStore { db: &db }; + let manifest_before = read_overflow(&page_store, docs_before.pointer) + .expect("read paged manifest before insert"); + let manifest_before = decode_paged_table_manifest_payload(&manifest_before) + .expect("decode paged manifest before insert"); + assert!( + manifest_before.chunks.len() > 2, + "expected multiple chunks before insert" + ); + manifest_before + .chunks + .iter() + .take(manifest_before.chunks.len() - 1) + .map(|chunk| chunk.pointer) + .collect::>() + }; + + let insert = db + .prepare("INSERT INTO docs VALUES ($1, $2, $3)") + .expect("prepare insert after reopen"); + insert + .execute(&[ + Value::Int64(97), + Value::Int64(9600), + Value::Text("z".repeat(2048)), + ]) + .expect("insert row after reopen"); + + untouched_chunk_pointers + }; + + let db = Db::open_or_create(&path, config).expect("reopen mutated db"); + let preserved_untouched = { + let runtime_after = db.inner.engine.read().expect("engine runtime lock"); + let docs_after = runtime_after + .persisted_tables + .get("docs") + .expect("persisted docs after insert"); + assert!( + docs_after.pointer.is_table_paged_manifest(), + "inserted table should remain paged" + ); + assert!( + docs_after.pk_index_root.is_some(), + "inserted paged table should retain persistent pk locator root" + ); + let page_store = PagerReadStore { db: &db }; + let manifest_after = read_overflow(&page_store, docs_after.pointer) + .expect("read paged manifest after insert"); + let manifest_after = decode_paged_table_manifest_payload(&manifest_after) + .expect("decode paged manifest after insert"); + assert_eq!( + manifest_after + .chunks + .iter() + .map(|chunk| chunk.row_count) + .sum::(), + 97, + "paged manifest row counts should 
reflect the insert" + ); + manifest_after + .chunks + .iter() + .filter_map(|chunk| { + untouched_chunk_pointers + .contains(&chunk.pointer) + .then_some(chunk.pointer) + }) + .collect::>() + }; + assert_eq!( + preserved_untouched, untouched_chunk_pointers, + "untouched paged chunks should retain their original pointers after reopen-time insert" + ); + + let inserted = db + .execute("SELECT n FROM docs WHERE id = 97") + .expect("point lookup after reopen insert"); + assert_eq!(scalar_i64(&inserted), 9600); + assert_eq!( + scalar_i64( + &db.execute("SELECT COUNT(*) FROM docs") + .expect("count docs rows") + ), + 97 + ); + let json_after = db + .inspect_storage_state_json() + .expect("json after insert reopen"); + assert!( + json_after.contains("\"loaded_table_count\":0"), + "expected inserted paged table to remain off the resident path, got: {json_after}" + ); + assert!( + json_after.contains("\"deferred_table_count\":1"), + "expected inserted paged table to remain deferred after reopen, got: {json_after}" ); } #[test] - fn paged_row_storage_prepared_insert_after_reopen_preserves_untouched_chunks() { + fn paged_row_storage_prepared_insert_after_reopen_preserves_untouched_chunks_without_persistent_pk_index( + ) { let tempdir = TempDir::new().expect("tempdir"); let path = tempdir .path() - .join("paged-row-storage-insert-after-reopen.ddb"); + .join("paged-row-storage-insert-after-reopen-without-pk-index.ddb"); let config = DbConfig { - persistent_pk_index: true, + persistent_pk_index: false, paged_row_storage: true, ..DbConfig::default() }; @@ -20027,8 +21285,8 @@ mod tests { "inserted table should remain paged" ); assert!( - docs_after.pk_index_root.is_some(), - "inserted paged table should retain persistent pk locator root" + docs_after.pk_index_root.is_none(), + "non-persistent pk config should not write a pk locator root" ); let page_store = PagerReadStore { db: &db }; let manifest_after = read_overflow(&page_store, docs_after.pointer) @@ -22704,11 +23962,15 @@ 
mod tests { let insert = txn .prepare("INSERT INTO docs VALUES ($1, $2)") .expect("prepare insert"); + let large_body = "x".repeat(2048); for i in 0_i64..64_i64 { insert .execute_in( &mut txn, - &[Value::Int64(i + 1), Value::Text(format!("body-{i}"))], + &[ + Value::Int64(i + 1), + Value::Text(format!("body-{i}-{large_body}")), + ], ) .expect("insert row"); } @@ -22767,11 +24029,15 @@ mod tests { let insert = txn .prepare("INSERT INTO docs VALUES ($1, $2)") .expect("prepare insert"); + let large_body = "x".repeat(2048); for i in 0_i64..64_i64 { insert .execute_in( &mut txn, - &[Value::Int64(i + 1), Value::Text(format!("body-{i}"))], + &[ + Value::Int64(i + 1), + Value::Text(format!("body-{i}-{large_body}")), + ], ) .expect("insert row"); } @@ -22812,7 +24078,16 @@ mod tests { db.execute("CREATE UNIQUE INDEX Libraries_kind_unique ON Libraries (kind) WHERE kind != 3") .expect("create partial unique index"); - db.execute("INSERT INTO Libraries (id, name, kind) VALUES (11, 'Storage One', 3)") + let large_name = "x".repeat(70_000); + let insert = db + .prepare("INSERT INTO Libraries (id, name, kind) VALUES ($1, $2, $3)") + .expect("prepare first excluded row"); + insert + .execute(&[ + Value::Int64(11), + Value::Text(format!("Storage One {large_name}")), + Value::Int64(3), + ]) .expect("insert excluded row"); let json_after_first = db .inspect_storage_state_json() @@ -22823,10 +24098,14 @@ mod tests { ); let prepared = db - .prepare("INSERT INTO Libraries (id, name, kind) VALUES (12, 'Storage Two', 3)") + .prepare("INSERT INTO Libraries (id, name, kind) VALUES ($1, $2, $3)") .expect("prepare second excluded row after redefer"); prepared - .execute(&[]) + .execute(&[ + Value::Int64(12), + Value::Text(format!("Storage Two {large_name}")), + Value::Int64(3), + ]) .expect("execute second excluded row after redefer"); assert_eq!( scalar_i64( @@ -27229,6 +28508,183 @@ mod tests { ); } + #[test] + fn paged_row_storage_scalar_filtered_aggregate_keeps_deferred_table_unloaded() { 
+ let tempdir = TempDir::new().expect("tempdir"); + let path = tempdir + .path() + .join("paged-row-storage-scalar-filtered-aggregate.ddb"); + let config = DbConfig { + paged_row_storage: true, + ..DbConfig::default() + }; + + { + let db = Db::open_or_create(&path, config.clone()).expect("create db"); + db.execute( + "CREATE TABLE orders ( + id INTEGER PRIMARY KEY, + user_id INTEGER, + amount FLOAT64, + body TEXT + )", + ) + .expect("create orders"); + let large_body = "x".repeat(2048); + let mut txn = db.transaction().expect("begin txn"); + let insert = txn + .prepare("INSERT INTO orders VALUES ($1, $2, $3, $4)") + .expect("prepare insert"); + for i in 0_i64..128_i64 { + insert + .execute_in( + &mut txn, + &[ + Value::Int64(i), + Value::Int64(i % 4), + Value::Float64(i as f64 + 0.5), + Value::Text(large_body.clone()), + ], + ) + .expect("insert row"); + } + txn.commit().expect("commit rows"); + db.checkpoint().expect("checkpoint"); + } + + let db = Db::open_or_create(&path, config).expect("reopen with paged storage"); + let aggregate = db + .prepare("SELECT COUNT(*), SUM(amount) FROM orders WHERE user_id = $1") + .expect("prepare aggregate"); + assert!( + aggregate.simple_scalar_filtered_aggregate.is_some(), + "expected prepared scalar aggregate cache plan" + ); + let result = aggregate + .execute(&[Value::Int64(2)]) + .expect("scalar filtered aggregate"); + assert_eq!(result.rows().len(), 1); + assert_eq!( + result.rows()[0].values(), + &[Value::Int64(32), Value::Float64(2064.0)] + ); + let plan = aggregate + .simple_scalar_filtered_aggregate + .as_ref() + .expect("prepared scalar aggregate plan"); + assert_eq!( + plan.cache.lock().expect("cache lock").entries.len(), + 1, + "expected first aggregate execution to populate the cache" + ); + let cached = aggregate + .execute(&[Value::Int64(2)]) + .expect("cached scalar filtered aggregate"); + assert_eq!(cached.rows(), result.rows()); + assert_eq!( + plan.cache.lock().expect("cache lock").entries.len(), + 1, + 
"expected repeated aggregate execution to reuse the cached key" + ); + + let json_after = db + .inspect_storage_state_json() + .expect("json after scalar filtered aggregate"); + assert!( + json_after.contains("\"loaded_table_count\":0"), + "expected scalar filtered aggregate to keep table deferred, got: {json_after}" + ); + assert!( + json_after.contains("\"deferred_table_count\":1"), + "expected scalar filtered aggregate to keep one deferred table, got: {json_after}" + ); + assert!( + json_after.contains("\"rows_in_memory_count\":0"), + "expected zero resident rows after scalar filtered aggregate, got: {json_after}" + ); + } + + #[test] + fn resident_scalar_filtered_aggregate_cache_invalidates_after_write() { + let tempdir = TempDir::new().expect("tempdir"); + let path = tempdir + .path() + .join("resident-scalar-filtered-aggregate-cache.ddb"); + let config = DbConfig { + paged_row_storage: false, + retain_paged_row_sources_after_commit: true, + wal_checkpoint_threshold_pages: 0, + wal_checkpoint_threshold_bytes: 0, + ..DbConfig::default() + }; + let db = Db::open_or_create(&path, config).expect("create db"); + db.execute("CREATE TABLE orders (id INTEGER PRIMARY KEY, user_id INTEGER, amount FLOAT64)") + .expect("create orders"); + let mut txn = db.transaction().expect("begin txn"); + let insert = txn + .prepare("INSERT INTO orders VALUES ($1, $2, $3)") + .expect("prepare insert"); + for i in 0_i64..16_i64 { + insert + .execute_in( + &mut txn, + &[ + Value::Int64(i), + Value::Int64(i % 4), + Value::Float64(i as f64 + 0.5), + ], + ) + .expect("insert row"); + } + txn.commit().expect("commit rows"); + + let aggregate = db + .prepare("SELECT COUNT(*), SUM(amount) FROM orders WHERE user_id = $1") + .expect("prepare aggregate"); + let plan = aggregate + .simple_scalar_filtered_aggregate + .as_ref() + .expect("prepared scalar aggregate plan"); + let result = aggregate + .execute(&[Value::Int64(2)]) + .expect("resident aggregate"); + assert_eq!( + 
result.rows()[0].values(), + &[Value::Int64(4), Value::Float64(34.0)] + ); + assert_eq!( + plan.cache.lock().expect("cache lock").entries.len(), + 1, + "expected resident aggregate execution to populate the cache" + ); + let cached = aggregate + .execute(&[Value::Int64(2)]) + .expect("cached resident aggregate"); + assert_eq!(cached.rows(), result.rows()); + assert_eq!( + plan.cache.lock().expect("cache lock").entries.len(), + 1, + "expected repeated resident aggregate execution to reuse the cache" + ); + + db.prepare("INSERT INTO orders VALUES ($1, $2, $3)") + .expect("prepare autocommit insert") + .execute(&[Value::Int64(100), Value::Int64(2), Value::Float64(10.0)]) + .expect("insert new matching row"); + let after_write = aggregate + .execute(&[Value::Int64(2)]) + .expect("resident aggregate after write"); + assert_eq!( + after_write.rows()[0].values(), + &[Value::Int64(5), Value::Float64(44.0)] + ); + assert_eq!( + plan.cache.lock().expect("cache lock").entries.len(), + 2, + "expected post-write aggregate to use a new snapshot cache key" + ); + } + #[test] fn paged_row_storage_grouped_numeric_aggregate_with_order_limit_offset_keeps_deferred_table_unloaded( ) { @@ -29256,11 +30712,12 @@ mod tests { let db = Db::open_or_create(":memory:", DbConfig::default()).expect("open db"); db.execute("CREATE TABLE m (id INTEGER PRIMARY KEY, label TEXT)") .expect("create m"); + let label = "value-with-enough-text-to-require-paged-row-storage".repeat(64); for i in 0..32 { - db.execute(&format!( - "INSERT INTO m (id, label) VALUES ({i}, 'value-{i}-with-some-text')" - )) - .expect("insert m"); + db.prepare("INSERT INTO m (id, label) VALUES ($1, $2)") + .expect("prepare insert") + .execute(&[Value::Int64(i), Value::Text(label.clone())]) + .expect("insert m"); } let json = db .inspect_storage_state_json() @@ -29285,9 +30742,9 @@ mod tests { json.contains("\"wal_on_disk_versions\":"), "missing wal_on_disk_versions: {json}" ); - // With default paged_row_storage: true, the table 
stays deferred - // after the autocommit insert. Only non-paged paths materialize - // table data into memory. + // With default paged_row_storage: true, tables large enough to need + // chunked storage stay deferred after autocommit inserts. Only small + // single-payload tables stay resident in memory. assert!( json.contains("\"deferred_table_count\":1"), "expected one deferred table with paged_row_storage=true, got: {json}" @@ -29300,6 +30757,78 @@ mod tests { ); } + #[test] + fn paged_row_storage_keeps_small_append_table_single_payload() { + let db = Db::open_or_create(":memory:", DbConfig::default()).expect("open db"); + db.execute("CREATE TABLE orders (id INTEGER PRIMARY KEY, user_id INTEGER, amount FLOAT64)") + .expect("create orders"); + let insert = db + .prepare("INSERT INTO orders (id, user_id, amount) VALUES ($1, $2, $3)") + .expect("prepare insert"); + for i in 0_i64..128_i64 { + insert + .execute(&[Value::Int64(i), Value::Int64(i % 16), Value::Float64(9.99)]) + .expect("insert order"); + } + + let runtime = db.inner.engine.read().expect("runtime lock"); + let state = runtime + .persisted_tables + .get("orders") + .expect("orders persisted state"); + assert!( + !state.pointer.is_table_paged_manifest(), + "small append-only tables should avoid paged manifest overhead" + ); + assert!( + runtime.tables.contains_key("orders"), + "small single-payload table should remain resident" + ); + assert_eq!(runtime.deferred_table_names().count(), 0); + } + + #[test] + fn paged_row_storage_converts_small_payload_after_chunk_threshold() { + let db = Db::open_or_create(":memory:", DbConfig::default()).expect("open db"); + db.execute("CREATE TABLE docs (id INTEGER PRIMARY KEY, body TEXT)") + .expect("create docs"); + let insert = db + .prepare("INSERT INTO docs (id, body) VALUES ($1, $2)") + .expect("prepare insert"); + let body = "x".repeat(2048); + for i in 0_i64..40_i64 { + insert + .execute(&[Value::Int64(i + 1), Value::Text(body.clone())]) + .expect("insert doc"); 
+ } + + { + let runtime = db.inner.engine.read().expect("runtime lock"); + let state = runtime + .persisted_tables + .get("docs") + .expect("docs persisted state"); + assert!( + state.pointer.is_table_paged_manifest(), + "large append-only tables should convert to paged storage" + ); + assert!( + runtime + .deferred_table_names() + .any(|name| name.eq_ignore_ascii_case("docs")), + "converted paged table should be re-deferred after write" + ); + } + + assert_eq!( + scalar_i64( + &db.execute("SELECT COUNT(*) FROM docs") + .expect("count docs rows") + ), + 40 + ); + } + /// ADR 0143 Phase B: by default, re-opening a DB leaves persisted /// tables in the deferred set until the first SQL statement runs, /// then materializes them. @@ -29819,6 +31348,10 @@ mod tests { let prepared = db .prepare("SELECT name FROM users WHERE id >= $1 AND id < $2 ORDER BY id LIMIT $3") .expect("prepare range lookup"); + assert!( + prepared.simple_row_id_range_projection.is_some(), + "expected prepared range lookup to cache the row-id range plan" + ); let result = prepared .execute(&[Value::Int64(10), Value::Int64(20), Value::Int64(3)]) .expect("execute prepared range lookup"); @@ -29859,6 +31392,107 @@ mod tests { ); } + #[test] + fn prepared_row_id_join_uses_deferred_locator_cache() { + let tempdir = TempDir::new().expect("tempdir"); + let path = tempdir.path().join("prepared-deferred-row-id-join.ddb"); + let db = Db::open_or_create(&path, DbConfig::default()).expect("create db"); + db.execute("CREATE TABLE join_users (id INT64 PRIMARY KEY, name TEXT, body TEXT)") + .expect("create join_users"); + db.execute("CREATE TABLE join_profiles (id INT64 PRIMARY KEY, bio TEXT, body TEXT)") + .expect("create join_profiles"); + + let body = "x".repeat(1024); + let mut txn = db.transaction().expect("begin txn"); + let users = txn + .prepare("INSERT INTO join_users (id, name, body) VALUES ($1, $2, $3)") + .expect("prepare users"); + let profiles = txn + .prepare("INSERT INTO join_profiles (id, bio, 
body) VALUES ($1, $2, $3)") + .expect("prepare profiles"); + for id in 1_i64..=256_i64 { + users + .execute_in( + &mut txn, + &[ + Value::Int64(id), + Value::Text(format!("u{id}")), + Value::Text(body.clone()), + ], + ) + .expect("insert user"); + profiles + .execute_in( + &mut txn, + &[ + Value::Int64(id), + Value::Text(format!("b{id}")), + Value::Text(body.clone()), + ], + ) + .expect("insert profile"); + } + txn.commit().expect("commit rows"); + + { + let runtime = db.inner.engine.read().expect("engine runtime lock"); + for table_name in ["join_users", "join_profiles"] { + let table = runtime + .persisted_tables + .get(table_name) + .expect("persisted table after commit"); + assert!( + table.pointer.is_table_paged_manifest(), + "expected benchmark-shaped table {table_name} to use paged storage" + ); + assert!( + runtime.has_deferred_paged_row_locator_cache_for_tests(table_name), + "expected INT64 primary-key table {table_name} to build a deferred locator cache" + ); + } + } + + let prepared = db + .prepare( + "SELECT u.name, p.bio \ + FROM join_users AS u \ + JOIN join_profiles AS p ON u.id = p.id \ + WHERE u.id = $1", + ) + .expect("prepare join lookup"); + assert!( + prepared.simple_row_id_join_projection.is_some(), + "expected prepared join lookup to cache the row-id join plan" + ); + let result = prepared + .execute(&[Value::Int64(42)]) + .expect("execute prepared join lookup"); + assert_eq!(result.rows().len(), 1); + assert_eq!( + result.rows()[0].values(), + &[ + Value::Text("u42".to_string()), + Value::Text("b42".to_string()) + ] + ); + + let json_after = db + .inspect_storage_state_json() + .expect("json after prepared join lookup"); + assert!( + json_after.contains("\"loaded_table_count\":0"), + "expected prepared join lookup to avoid materialization, got: {json_after}" + ); + assert!( + json_after.contains("\"deferred_table_count\":2"), + "expected both join tables to remain deferred, got: {json_after}" + ); + assert!( + 
json_after.contains("\"rows_in_memory_count\":0"), + "expected zero resident rows after prepared deferred join lookup, got: {json_after}" + ); + } + #[test] fn disabled_sync_write_does_not_buffer_runtime_mutations() { let tempdir = TempDir::new().expect("tempdir"); diff --git a/crates/decentdb/src/exec/mod.rs b/crates/decentdb/src/exec/mod.rs index 842cd16..e30b02b 100644 --- a/crates/decentdb/src/exec/mod.rs +++ b/crates/decentdb/src/exec/mod.rs @@ -1764,7 +1764,47 @@ pub(crate) struct SimpleRowIdProjectionRequest<'a> { pub(crate) struct ResolvedSimpleRowIdProjectionRequest<'a> { pub(crate) table_name: &'a str, pub(crate) projection_indexes: &'a [usize], - pub(crate) column_names: &'a [String], + pub(crate) column_names: Arc<[String]>, + pub(crate) lookup_row_id: i64, + pub(crate) pager: &'a PagerHandle, + pub(crate) wal: &'a WalHandle, + pub(crate) snapshot_lsn: u64, + pub(crate) use_persistent_pk_index: bool, +} + +pub(crate) struct ResolvedSimpleRowIdRangeProjectionRequest<'a> { + pub(crate) table_name: &'a str, + pub(crate) projection_indexes: &'a [usize], + pub(crate) column_names: Arc<[String]>, + pub(crate) filter_column: &'a str, + pub(crate) lower_bound: Option, + pub(crate) upper_bound: Option, + pub(crate) limit: Option, + pub(crate) pager: &'a PagerHandle, + pub(crate) wal: &'a WalHandle, + pub(crate) snapshot_lsn: u64, + pub(crate) use_persistent_pk_index: bool, +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub(crate) enum SimpleJoinProjectionSide { + Left, + Right, +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub(crate) struct ResolvedSimpleJoinProjection { + pub(crate) side: SimpleJoinProjectionSide, + pub(crate) index: usize, +} + +pub(crate) struct ResolvedSimpleRowIdJoinProjectionRequest<'a> { + pub(crate) left_table_name: &'a str, + pub(crate) right_table_name: &'a str, + pub(crate) left_projection_indexes: &'a [usize], + pub(crate) right_projection_indexes: &'a [usize], + pub(crate) projections: &'a [ResolvedSimpleJoinProjection], 
+ pub(crate) column_names: Arc<[String]>, pub(crate) lookup_row_id: i64, pub(crate) pager: &'a PagerHandle, pub(crate) wal: &'a WalHandle, @@ -1775,7 +1815,7 @@ pub(crate) struct ResolvedSimpleRowIdProjectionRequest<'a> { struct ValidatedSimpleRowIdProjectionRequest<'a> { table_schema: &'a TableSchema, projection_indexes: &'a [usize], - column_names: Vec, + column_names: Arc<[String]>, lookup_row_id: i64, pager: &'a PagerHandle, wal: &'a WalHandle, @@ -2096,15 +2136,31 @@ impl EngineRuntime { } let chunk_payloads = read_paged_table_chunk_payloads(store, state)?; + self.refresh_paged_lookup_cache_and_pk_index_from_chunks( + db, + table_name, + state, + &chunk_payloads, + ) + } + + fn refresh_paged_lookup_cache_and_pk_index_from_chunks( + &mut self, + db: &crate::db::Db, + table_name: &str, + state: PersistedTableState, + chunk_payloads: &[TablePageManifestChunk], + ) -> Result> { + let needs_locator_cache = self.should_cache_deferred_paged_row_locators(table_name); if needs_locator_cache { - self.cache_deferred_paged_row_locators(table_name, state, &chunk_payloads)?; + self.cache_deferred_paged_row_locators(table_name, state, chunk_payloads)?; } else { self.deferred_paged_row_locator_caches_mut() .remove(table_name); } if db.config().persistent_pk_index { - build_persistent_pk_index_root_from_chunk_payloads(db, &chunk_payloads) + build_persistent_pk_index_root_from_chunk_payloads(db, chunk_payloads) } else { Ok(None) } @@ -2550,28 +2606,80 @@ impl EngineRuntime { .copied() .unwrap_or_default(); let previous_pointer = previous_state.pointer; - let use_paged_row_storage = + let row_source = + self.tables + .get(&canonical_table_name) + .cloned() + .ok_or_else(|| { + DbError::internal(format!("table data for {table_name} is missing")) + })?; + let mut use_paged_row_storage = db.config().paged_row_storage || previous_pointer.is_table_paged_manifest(); + if db.config().paged_row_storage && !previous_pointer.is_table_paged_manifest() { + use_paged_row_storage = match 
&row_source { + TableRowSource::Resident(data) => resident_table_should_use_paged_storage( + data, + previous_state, + &delta, + db.config().page_size, + )?, + TableRowSource::Paged(manifest) => { + manifest.chunks.len() > 1 + || manifest.chunks.first().is_some_and(|chunk| { + chunk.payload.len() + > paged_table_target_chunk_bytes(db.config().page_size) + }) + } + }; + } let cached_payload = if !use_paged_row_storage || !previous_pointer.is_table_paged_manifest() { self.take_cached_payload(&canonical_table_name) } else { None }; - let row_source = self.tables.get(&canonical_table_name).ok_or_else(|| { - DbError::internal(format!("table data for {table_name} is missing")) - })?; if let Some(manifest) = row_source.paged_manifest() { self.overflow_chain_caches.remove(&canonical_table_name); - let new_state = + if delta.append_count > 0 + && delta.updated_rows.is_empty() + && delta.deleted_rows.is_empty() + && !db.config().persistent_pk_index + { + if let Some((new_state, persisted_chunks)) = + try_append_only_paged_table_from_manifest( + &mut store, + previous_state, + manifest, + )? 
+ { + self.persisted_tables_mut() + .insert(canonical_table_name.clone(), new_state); + let pk_index_root = self + .refresh_paged_lookup_cache_and_pk_index_from_chunks( + db, + &canonical_table_name, + new_state, + &persisted_chunks, + )?; + replace_table_pk_index_root( + self, + db, + &canonical_table_name, + pk_index_root, + )?; + self.cache_payload_remove(&canonical_table_name); + continue; + } + } + let (new_state, persisted_chunks) = rewrite_paged_table_from_manifest(&mut store, previous_state, manifest)?; self.persisted_tables_mut() .insert(canonical_table_name.clone(), new_state); - let pk_index_root = self.refresh_paged_lookup_cache_and_pk_index( + let pk_index_root = self.refresh_paged_lookup_cache_and_pk_index_from_chunks( db, - &store, &canonical_table_name, new_state, + &persisted_chunks, )?; replace_table_pk_index_root(self, db, &canonical_table_name, pk_index_root)?; self.cache_payload_remove(&canonical_table_name); @@ -3545,6 +3653,18 @@ impl EngineRuntime { } } + pub(crate) fn has_redeferable_persisted_tables(&self, names: &[&str]) -> bool { + names.iter().any(|name| { + let Some(table_name) = self.canonical_catalog_table_name(name) else { + return false; + }; + self.persisted_tables + .get(&table_name) + .is_some_and(|state| state.pointer.is_table_paged_manifest()) + && self.tables.contains_key(&table_name) + }) + } + pub(crate) fn redefer_all_persisted_paged_tables(&mut self) { let paged_names: Vec = self .persisted_tables @@ -5085,7 +5205,7 @@ impl EngineRuntime { render_simple_grouped_count_groups(self, groups, plan, params) } - fn try_execute_simple_grouped_numeric_aggregate_query( + pub(crate) fn try_execute_simple_grouped_numeric_aggregate_query( &self, query: &Query, params: &[Value], @@ -5850,8 +5970,22 @@ impl EngineRuntime { plan.aggregate_bindings.len() ]; - visit_persisted_table_rows(store, state, |_, values| { - if compare_values(&values[filter_column_index], &filter_value)? 
+ let mut projection_indexes = Vec::with_capacity(plan.aggregate_bindings.len() + 1); + projection_indexes.push(filter_column_index); + let filter_projection_index = 0; + let mut aggregate_projection_indexes = Vec::with_capacity(plan.aggregate_bindings.len()); + for aggregate in &plan.aggregate_bindings { + let Some(source_column_index) = aggregate.source_column_index else { + aggregate_projection_indexes.push(None); + continue; + }; + let projection_index = + push_projection_index(&mut projection_indexes, source_column_index); + aggregate_projection_indexes.push(Some(projection_index)); + } + + visit_persisted_table_projected_values(store, state, &projection_indexes, |_, values| { + if compare_values(&values[filter_projection_index], &filter_value)? != std::cmp::Ordering::Equal { return Ok(()); @@ -5859,10 +5993,11 @@ impl EngineRuntime { row_count += 1; for (aggregate_index, aggregate) in plan.aggregate_bindings.iter().enumerate() { if matches!(aggregate.kind, SimpleGroupedNumericAggregateKind::Sum) { - let source_column_index = aggregate.source_column_index.ok_or_else(|| { - DbError::internal("simple scalar SUM missing source column") - })?; - numeric_states[aggregate_index].add(&values[source_column_index])?; + let source_projection_index = aggregate_projection_indexes[aggregate_index] + .ok_or_else(|| { + DbError::internal("simple scalar SUM missing source column") + })?; + numeric_states[aggregate_index].add(&values[source_projection_index])?; } } Ok(()) @@ -6172,7 +6307,7 @@ impl EngineRuntime { )) } - fn try_execute_simple_deferred_paged_grouped_numeric_aggregate_query( + pub(crate) fn try_execute_simple_deferred_paged_grouped_numeric_aggregate_query( &self, query: &Query, params: &[Value], @@ -9784,18 +9919,19 @@ impl EngineRuntime { return Ok(None); } probe_steps = probe_steps.saturating_add(1); - if let Some(row) = read_deferred_stored_row_by_id( + if let Some(values) = read_deferred_projected_values_by_id( store, state, table_schema, row_id, 
use_persistent_pk_index, paged_locator_cache, + projection_indexes, )? { if skipped < offset { skipped += 1; } else { - rows.push(project_simple_projection_row(&row, projection_indexes)); + rows.push(QueryRow::new(values)); } } let Some(next_row_id) = row_id.checked_add(1) else { @@ -11256,7 +11392,7 @@ impl EngineRuntime { ValidatedSimpleRowIdProjectionRequest { table_schema, projection_indexes: &projection_indexes, - column_names, + column_names: Arc::from(column_names), lookup_row_id: request.lookup_row_id, pager: request.pager, wal: request.wal, @@ -11294,7 +11430,7 @@ impl EngineRuntime { ValidatedSimpleRowIdProjectionRequest { table_schema, projection_indexes: request.projection_indexes, - column_names: request.column_names.to_vec(), + column_names: Arc::clone(&request.column_names), lookup_row_id: request.lookup_row_id, pager: request.pager, wal: request.wal, @@ -11304,6 +11440,257 @@ impl EngineRuntime { ) } + pub(crate) fn execute_resolved_simple_row_id_range_projection_at_snapshot( + &self, + request: ResolvedSimpleRowIdRangeProjectionRequest<'_>, + ) -> Result> { + if self + .visible_view(request.table_name, NameResolutionScope::Session) + .is_some() + || self.visible_table_is_temporary(request.table_name) + { + return Ok(None); + } + let Some(table_schema) = self.table_schema(request.table_name) else { + return Ok(None); + }; + if !generated_columns_are_stored(table_schema) { + return Ok(None); + } + if request + .projection_indexes + .iter() + .any(|index| *index >= table_schema.columns.len()) + { + return Ok(None); + } + let Some(filter_column_index) = schema_column_index(table_schema, request.filter_column) + else { + return Ok(None); + }; + if !table_schema + .primary_key_columns + .iter() + .any(|column| identifiers_equal(column, request.filter_column)) + || table_schema.columns[filter_column_index].column_type != ColumnType::Int64 + { + return Ok(None); + } + + let canonical_table_name = table_schema.name.as_str(); + let no_alias = None; + if let 
Some(row_source) = self.visible_table_row_source(canonical_table_name) { + return self.try_simple_rowid_range_projection_result( + row_source, + table_schema, + TableBindingRef { + name: canonical_table_name, + alias: &no_alias, + }, + request.filter_column, + request.lower_bound.as_ref(), + request.upper_bound.as_ref(), + request.projection_indexes, + request.column_names.to_vec(), + &[], + request.limit, + 0, + ); + } + + if !self.has_deferred_tables() + || !self + .deferred_table_names() + .any(|candidate| identifiers_equal(candidate, canonical_table_name)) + { + return Ok(None); + } + let Some(state) = self.persisted_table_state(canonical_table_name) else { + return Ok(None); + }; + let store = SnapshotPageStore { + pager: request.pager, + wal: request.wal, + snapshot_lsn: request.snapshot_lsn, + }; + let paged_locator_cache = self + .catalog + .table(canonical_table_name) + .and_then(|table| self.deferred_paged_row_locator_caches.get(&table.name)) + .map(|cache| cache.as_ref()); + self.try_simple_deferred_rowid_range_projection_result( + &store, + state, + table_schema, + TableBindingRef { + name: canonical_table_name, + alias: &no_alias, + }, + request.filter_column, + request.lower_bound.as_ref(), + request.upper_bound.as_ref(), + request.projection_indexes, + request.column_names.to_vec(), + &[], + request.limit, + 0, + request.use_persistent_pk_index, + paged_locator_cache, + ) + } + + pub(crate) fn execute_resolved_simple_row_id_join_projection_at_snapshot( + &self, + request: ResolvedSimpleRowIdJoinProjectionRequest<'_>, + ) -> Result> { + if self + .visible_view(request.left_table_name, NameResolutionScope::Session) + .is_some() + || self + .visible_view(request.right_table_name, NameResolutionScope::Session) + .is_some() + || self.visible_table_is_temporary(request.left_table_name) + || self.visible_table_is_temporary(request.right_table_name) + { + return Ok(None); + } + let Some(left_schema) = self.table_schema(request.left_table_name) else { + 
return Ok(None); + }; + let Some(right_schema) = self.table_schema(request.right_table_name) else { + return Ok(None); + }; + if !generated_columns_are_stored(left_schema) || !generated_columns_are_stored(right_schema) + { + return Ok(None); + } + if request + .projections + .iter() + .any(|projection| match projection.side { + SimpleJoinProjectionSide::Left => { + projection.index >= request.left_projection_indexes.len() + } + SimpleJoinProjectionSide::Right => { + projection.index >= request.right_projection_indexes.len() + } + }) + || request + .left_projection_indexes + .iter() + .any(|index| *index >= left_schema.columns.len()) + || request + .right_projection_indexes + .iter() + .any(|index| *index >= right_schema.columns.len()) + { + return Ok(None); + } + + let left_name = left_schema.name.as_str(); + let right_name = right_schema.name.as_str(); + if let (Some(left_source), Some(right_source)) = ( + self.visible_table_row_source(left_name), + self.visible_table_row_source(right_name), + ) { + let Some(left_row) = left_source.row_by_id(request.lookup_row_id)? else { + return Ok(Some(QueryResult::with_shared_columns( + Arc::clone(&request.column_names), + Vec::new(), + ))); + }; + let Some(right_row) = right_source.row_by_id(request.lookup_row_id)? 
else { + return Ok(Some(QueryResult::with_shared_columns( + Arc::clone(&request.column_names), + Vec::new(), + ))); + }; + let row = project_resolved_simple_join_row_from_full_values( + request.projections, + request.left_projection_indexes, + request.right_projection_indexes, + left_row.values(), + right_row.values(), + )?; + return Ok(Some(QueryResult::with_shared_columns( + Arc::clone(&request.column_names), + vec![row], + ))); + } + + let Some(left_state) = self.persisted_table_state(left_name) else { + return Ok(None); + }; + let Some(right_state) = self.persisted_table_state(right_name) else { + return Ok(None); + }; + let left_cache = self + .catalog + .table(left_name) + .and_then(|table| self.deferred_paged_row_locator_caches.get(&table.name)) + .map(|cache| cache.as_ref()); + let right_cache = self + .catalog + .table(right_name) + .and_then(|table| self.deferred_paged_row_locator_caches.get(&table.name)) + .map(|cache| cache.as_ref()); + if !deferred_rowid_lookup_available( + left_state, + left_schema, + request.use_persistent_pk_index, + left_cache, + ) || !deferred_rowid_lookup_available( + right_state, + right_schema, + request.use_persistent_pk_index, + right_cache, + ) { + return Ok(None); + } + + let store = SnapshotPageStore { + pager: request.pager, + wal: request.wal, + snapshot_lsn: request.snapshot_lsn, + }; + let Some(left_values) = read_deferred_projected_values_by_id( + &store, + left_state, + left_schema, + request.lookup_row_id, + request.use_persistent_pk_index, + left_cache, + request.left_projection_indexes, + )? + else { + return Ok(Some(QueryResult::with_shared_columns( + Arc::clone(&request.column_names), + Vec::new(), + ))); + }; + let Some(right_values) = read_deferred_projected_values_by_id( + &store, + right_state, + right_schema, + request.lookup_row_id, + request.use_persistent_pk_index, + right_cache, + request.right_projection_indexes, + )? 
+ else { + return Ok(Some(QueryResult::with_shared_columns( + Arc::clone(&request.column_names), + Vec::new(), + ))); + }; + let row = + project_resolved_simple_join_row(request.projections, &left_values, &right_values)?; + Ok(Some(QueryResult::with_shared_columns( + Arc::clone(&request.column_names), + vec![row], + ))) + } + fn execute_validated_simple_row_id_projection_at_snapshot( &self, request: ValidatedSimpleRowIdProjectionRequest<'_>, @@ -11320,7 +11707,10 @@ impl EngineRuntime { )] }) .unwrap_or_default(); - return Ok(Some(QueryResult::with_rows(request.column_names, rows))); + return Ok(Some(QueryResult::with_shared_columns( + Arc::clone(&request.column_names), + rows, + ))); } if !self.has_deferred_tables() @@ -11343,22 +11733,21 @@ impl EngineRuntime { wal: request.wal, snapshot_lsn: request.snapshot_lsn, }; - let rows = read_deferred_stored_row_by_id( + let rows = read_deferred_projected_values_by_id( &store, state, table_schema, request.lookup_row_id, request.use_persistent_pk_index, paged_locator_cache, + request.projection_indexes, )? 
- .map(|stored_row| { - vec![project_simple_projection_owned_row( - stored_row, - request.projection_indexes, - )] - }) + .map(|values| vec![QueryRow::new(values)]) .unwrap_or_default(); - Ok(Some(QueryResult::with_rows(request.column_names, rows))) + Ok(Some(QueryResult::with_shared_columns( + Arc::clone(&request.column_names), + rows, + ))) } fn try_execute_simple_deferred_rowid_join_projection_query( @@ -18710,6 +19099,51 @@ fn encode_table_payload(data: &TableData) -> Result> { Ok(output) } +fn encoded_table_row_len(row: &StoredRow, scratch: &mut Vec) -> Result { + scratch.clear(); + Row::encode_values_into(&row.values, scratch)?; + Ok(8usize.saturating_add(4).saturating_add(scratch.len())) +} + +fn resident_table_should_use_paged_storage( + data: &TableData, + previous_state: PersistedTableState, + delta: &PagedMutationDelta, + page_size: u32, +) -> Result { + if data.rows.is_empty() { + return Ok(false); + } + + let target_chunk_bytes = paged_table_target_chunk_bytes(page_size); + if previous_state.pointer.head_page_id != 0 + && previous_state.pointer.logical_len as usize > target_chunk_bytes + { + return Ok(true); + } + + let append_only = + delta.append_count > 0 && delta.updated_rows.is_empty() && delta.deleted_rows.is_empty(); + let mut encoded_len = if append_only && previous_state.pointer.head_page_id != 0 { + previous_state.pointer.logical_len as usize + } else { + TABLE_PAYLOAD_MAGIC.len() + 4 + }; + let start = if append_only && previous_state.pointer.head_page_id != 0 { + data.rows.len().saturating_sub(delta.append_count) + } else { + 0 + }; + let mut scratch = Vec::with_capacity(64); + for row in &data.rows[start..] 
{ + encoded_len = encoded_len.saturating_add(encoded_table_row_len(row, &mut scratch)?); + if encoded_len > target_chunk_bytes { + return Ok(true); + } + } + Ok(false) +} + fn paged_table_target_chunk_bytes(page_size: u32) -> usize { (page_size as usize) .saturating_mul(PAGED_TABLE_TARGET_CHUNK_PAGES) @@ -18968,6 +19402,43 @@ where Ok(row_count) } +fn visit_table_payload_projected_values_from_bytes( + bytes: &[u8], + projection_indexes: &[usize], + tombstoned_row_ids: Option<&[i64]>, + visitor: &mut F, +) -> Result +where + F: FnMut(i64, &[Value]) -> Result<()>, +{ + if bytes.is_empty() { + return Ok(0); + } + let mut cursor = Cursor::new(bytes); + let magic = cursor.read_slice(TABLE_PAYLOAD_MAGIC.len())?; + if magic != TABLE_PAYLOAD_MAGIC { + return Err(DbError::corruption("table payload magic is invalid")); + } + let row_count = cursor.read_u32()? as usize; + let mut visible_count = 0usize; + for _ in 0..row_count { + let row_id = cursor.read_i64()?; + let row_bytes_len = cursor.read_u32()? 
as usize; + let row_bytes = cursor.read_slice(row_bytes_len)?; + if tombstoned_row_ids.is_some_and(|row_ids| row_ids.contains(&row_id)) { + continue; + } + let values = Row::decode_projection_with_overflow::( + row_bytes, + None, + projection_indexes, + )?; + visitor(row_id, &values)?; + visible_count += 1; + } + Ok(visible_count) +} + fn visit_table_payload_rows_from_pointer( store: &S, pointer: OverflowPointer, @@ -19001,6 +19472,49 @@ where Ok(row_count) } +fn visit_table_payload_projected_values_from_pointer( + store: &S, + pointer: OverflowPointer, + projection_indexes: &[usize], + visitor: &mut F, +) -> Result +where + F: FnMut(i64, &[Value]) -> Result<()>, +{ + if pointer.head_page_id == 0 || pointer.logical_len == 0 { + return Ok(0); + } + if pointer.is_compressed() { + let payload = read_overflow(store, pointer)?; + return visit_table_payload_projected_values_from_bytes( + &payload, + projection_indexes, + None, + visitor, + ); + } + + let mut cursor = OverflowPayloadCursor::new(store, pointer); + let mut magic = [0_u8; TABLE_PAYLOAD_MAGIC.len()]; + cursor.read_exact(&mut magic)?; + if magic != *TABLE_PAYLOAD_MAGIC { + return Err(DbError::corruption("table payload magic is invalid")); + } + let row_count = cursor.read_u32()? as usize; + for _ in 0..row_count { + let row_id = cursor.read_i64()?; + let row_bytes_len = cursor.read_u32()? 
as usize; + let row_bytes = cursor.read_vec(row_bytes_len)?; + let values = Row::decode_projection_with_overflow::( + &row_bytes, + None, + projection_indexes, + )?; + visitor(row_id, &values)?; + } + Ok(row_count) +} + fn visit_table_payload_int64_column_from_bytes( bytes: &[u8], column_index: usize, @@ -19138,6 +19652,78 @@ where Ok(total_row_count) } +fn visit_persisted_table_projected_values( + store: &S, + state: PersistedTableState, + projection_indexes: &[usize], + mut visitor: F, +) -> Result +where + F: FnMut(i64, &[Value]) -> Result<()>, +{ + if state.pointer.head_page_id == 0 || state.pointer.logical_len == 0 { + return Ok(0); + } + if !state.pointer.is_table_paged_manifest() { + let row_count = visit_table_payload_projected_values_from_pointer( + store, + state.pointer, + projection_indexes, + &mut visitor, + )?; + if state.row_count != 0 && row_count != state.row_count { + return Err(DbError::corruption("table payload row count mismatch")); + } + return Ok(row_count); + } + + let manifest_payload = read_overflow(store, state.pointer)?; + if crc32c_parts(&[manifest_payload.as_slice()]) != state.checksum { + return Err(DbError::corruption( + "paged table manifest checksum mismatch", + )); + } + let manifest = decode_paged_table_manifest_payload(&manifest_payload)?; + let mut total_row_count = 0usize; + for chunk in manifest.chunks { + let mut count = 0usize; + let tombstones = if chunk.tombstoned_row_ids.is_empty() { + None + } else { + Some(chunk.tombstoned_row_ids.as_slice()) + }; + + let base_payload = read_overflow(store, chunk.pointer)?; + count += visit_table_payload_projected_values_from_bytes( + &base_payload, + projection_indexes, + tombstones, + &mut visitor, + )?; + + if let Some(overlay_pointer) = chunk.overlay_pointer { + let overlay_payload = read_overflow(store, overlay_pointer)?; + count += visit_table_payload_projected_values_from_bytes( + &overlay_payload, + projection_indexes, + None, + &mut visitor, + )?; + } + + if count != 
chunk.row_count { + return Err(DbError::corruption("paged table chunk row count mismatch")); + } + total_row_count = total_row_count.saturating_add(count); + } + if state.row_count != 0 && total_row_count != state.row_count { + return Err(DbError::corruption( + "paged table manifest row count mismatch", + )); + } + Ok(total_row_count) +} + fn visit_persisted_table_rows( store: &S, state: PersistedTableState, @@ -19496,14 +20082,16 @@ fn wrap_legacy_table_state_as_paged_manifest( fn append_paged_table_chunks( store: &mut S, - previous_state: PersistedTableState, + mut previous_state: PersistedTableState, appended_chunks: &[EncodedPagedTableChunk], row_count: usize, ) -> Result { - if previous_state.pointer.head_page_id == 0 || !previous_state.pointer.is_table_paged_manifest() - { + if previous_state.pointer.head_page_id == 0 { return persist_paged_table(store, previous_state, appended_chunks, row_count); } + if !previous_state.pointer.is_table_paged_manifest() { + previous_state = wrap_legacy_table_state_as_paged_manifest(store, previous_state)?; + } if appended_chunks.is_empty() { return Ok(previous_state); } @@ -19547,6 +20135,179 @@ fn append_paged_table_chunks( }) } +fn persisted_paged_chunk_is_plain(chunk: &PersistedTableChunkState) -> bool { + chunk.tombstoned_row_ids.is_empty() + && chunk.overlay_pointer.is_none() + && chunk.overlay_checksum.is_none() +} + +fn table_page_manifest_chunk_is_plain(chunk: &TablePageManifestChunk) -> bool { + chunk.tombstoned_row_ids.is_empty() + && chunk.overlay_pointer.is_none() + && chunk.overlay_checksum.is_none() + && chunk.overlay_payload.is_none() +} + +fn persisted_chunk_from_current( + pointer: OverflowPointer, + checksum: u32, + row_count: usize, + current_chunk: &TablePageManifestChunk, +) -> TablePageManifestChunk { + TablePageManifestChunk { + pointer, + checksum, + row_count, + payload: Arc::clone(¤t_chunk.payload), + tombstoned_row_ids: Arc::clone(¤t_chunk.tombstoned_row_ids), + overlay_pointer: None, + 
overlay_checksum: None, + overlay_payload: None, + } +} + +fn try_append_only_paged_table_from_manifest( + store: &mut S, + previous_state: PersistedTableState, + manifest: &TablePageManifest, +) -> Result)>> { + if previous_state.pointer.head_page_id == 0 || !previous_state.pointer.is_table_paged_manifest() + { + return Ok(None); + } + if manifest.chunks.is_empty() { + return Ok(None); + } + + let manifest_payload = read_overflow(store, previous_state.pointer)?; + if crc32c_parts(&[manifest_payload.as_slice()]) != previous_state.checksum { + return Err(DbError::corruption( + "paged table manifest checksum mismatch", + )); + } + let previous_manifest = decode_paged_table_manifest_payload(&manifest_payload)?; + if previous_manifest.chunks.is_empty() || manifest.chunks.len() < previous_manifest.chunks.len() + { + return Ok(None); + } + + let previous_tail_index = previous_manifest.chunks.len() - 1; + let mut first_changed_index = None; + let mut current_checksums = Vec::with_capacity(manifest.chunks.len()); + for (index, current_chunk) in manifest.chunks.iter().enumerate() { + if !table_page_manifest_chunk_is_plain(current_chunk) { + return Ok(None); + } + let checksum = crc32c_parts(&[current_chunk.payload.as_slice()]); + current_checksums.push(checksum); + let Some(previous_chunk) = previous_manifest.chunks.get(index) else { + continue; + }; + if !persisted_paged_chunk_is_plain(previous_chunk) { + return Ok(None); + } + if previous_chunk.checksum == checksum + && previous_chunk.row_count == current_chunk.row_count + { + continue; + } + if index != previous_tail_index { + return Ok(None); + } + first_changed_index = Some(index); + } + + if first_changed_index.is_none() && manifest.chunks.len() == previous_manifest.chunks.len() { + return Ok(None); + } + + let mut new_chunks = Vec::with_capacity(manifest.chunks.len()); + let mut persisted_chunks = Vec::with_capacity(manifest.chunks.len()); + + for (index, current_chunk) in manifest.chunks.iter().enumerate() { + let 
checksum = current_checksums[index]; + if let Some(previous_chunk) = previous_manifest.chunks.get(index) { + if previous_chunk.checksum == checksum + && previous_chunk.row_count == current_chunk.row_count + { + new_chunks.push(previous_chunk.clone()); + persisted_chunks.push(persisted_chunk_from_current( + previous_chunk.pointer, + previous_chunk.checksum, + previous_chunk.row_count, + current_chunk, + )); + continue; + } + + let pointer = rewrite_overflow( + store, + previous_chunk.pointer, + current_chunk.payload.as_slice(), + CompressionMode::Never, + )?; + new_chunks.push(PersistedTableChunkState { + pointer, + checksum, + row_count: current_chunk.row_count, + tombstoned_row_ids: Vec::new(), + overlay_pointer: None, + overlay_checksum: None, + }); + persisted_chunks.push(persisted_chunk_from_current( + pointer, + checksum, + current_chunk.row_count, + current_chunk, + )); + continue; + } + + let pointer = write_overflow( + store, + current_chunk.payload.as_slice(), + CompressionMode::Never, + )?; + new_chunks.push(PersistedTableChunkState { + pointer, + checksum, + row_count: current_chunk.row_count, + tombstoned_row_ids: Vec::new(), + overlay_pointer: None, + overlay_checksum: None, + }); + persisted_chunks.push(persisted_chunk_from_current( + pointer, + checksum, + current_chunk.row_count, + current_chunk, + )); + } + + let updated_manifest_payload = + encode_paged_table_manifest_payload(&PersistedPagedTableManifest { chunks: new_chunks })?; + let checksum = crc32c_parts(&[updated_manifest_payload.as_slice()]); + let pointer = rewrite_overflow( + store, + previous_state.pointer.with_table_paged_manifest(false), + &updated_manifest_payload, + CompressionMode::Never, + )? 
+ .with_table_paged_manifest(true); + let tail = read_uncompressed_overflow_tail(store, pointer)?.unwrap_or_default(); + + Ok(Some(( + PersistedTableState { + pointer, + checksum, + row_count: manifest.row_count(), + tail, + pk_index_root: previous_state.pk_index_root, + }, + persisted_chunks, + ))) +} + fn compact_paged_table_state_for_checkpoint( store: &mut S, state: PersistedTableState, @@ -19871,22 +20632,25 @@ fn rewrite_paged_table_from_manifest( store: &mut S, previous_state: PersistedTableState, manifest: &TablePageManifest, -) -> Result { +) -> Result<(PersistedTableState, Vec)> { if manifest.chunks.is_empty() { if previous_state.pointer.head_page_id != 0 { free_persisted_table_bytes(store, previous_state)?; } - return Ok(PersistedTableState { - pointer: OverflowPointer { - head_page_id: 0, - logical_len: 0, - flags: 0, + return Ok(( + PersistedTableState { + pointer: OverflowPointer { + head_page_id: 0, + logical_len: 0, + flags: 0, + }, + checksum: 0, + row_count: 0, + tail: OverflowTailInfo::default(), + pk_index_root: previous_state.pk_index_root, }, - checksum: 0, - row_count: 0, - tail: OverflowTailInfo::default(), - pk_index_root: previous_state.pk_index_root, - }); + Vec::new(), + )); } let previous_chunks = if previous_state.pointer.head_page_id != 0 @@ -19916,6 +20680,7 @@ fn rewrite_paged_table_from_manifest( let mut reused_previous = vec![false; reusable_previous.len()]; let mut new_chunks = Vec::with_capacity(manifest.chunks.len()); + let mut persisted_chunks = Vec::with_capacity(manifest.chunks.len()); for current_chunk in manifest.chunks.iter() { let checksum = crc32c_parts(&[current_chunk.payload.as_slice()]); let _overlay_checksum = current_chunk @@ -19945,19 +20710,31 @@ fn rewrite_paged_table_from_manifest( }); if let Some(index) = reused_index { reused_previous[index] = true; - new_chunks.push(reusable_previous[index].0.clone()); + let chunk_state = reusable_previous[index].0.clone(); + persisted_chunks.push(TablePageManifestChunk { + 
pointer: chunk_state.pointer, + checksum: chunk_state.checksum, + row_count: chunk_state.row_count, + payload: Arc::clone(¤t_chunk.payload), + tombstoned_row_ids: Arc::clone(¤t_chunk.tombstoned_row_ids), + overlay_pointer: chunk_state.overlay_pointer, + overlay_checksum: chunk_state.overlay_checksum, + overlay_payload: current_chunk.overlay_payload.clone(), + }); + new_chunks.push(chunk_state); continue; } let pointer = write_overflow(store, ¤t_chunk.payload, CompressionMode::Never)?; - let overlay = if let Some(overlay_payload) = ¤t_chunk.overlay_payload { - let overlay_pointer = - write_overflow(store, overlay_payload.as_slice(), CompressionMode::Never)?; - let overlay_checksum = crc32c_parts(&[overlay_payload.as_slice()]); - Some((overlay_pointer, overlay_checksum)) - } else { - None - }; + let (overlay_pointer, overlay_checksum) = + if let Some(overlay_payload) = ¤t_chunk.overlay_payload { + let overlay_pointer = + write_overflow(store, overlay_payload.as_slice(), CompressionMode::Never)?; + let overlay_checksum = crc32c_parts(&[overlay_payload.as_slice()]); + (Some(overlay_pointer), Some(overlay_checksum)) + } else { + (None, None) + }; let base_physical = read_table_payload_row_count_from_bytes(¤t_chunk.payload)?; let overlay_physical = current_chunk .overlay_payload @@ -19972,8 +20749,18 @@ fn rewrite_paged_table_from_manifest( checksum, row_count: visible, tombstoned_row_ids: current_chunk.tombstoned_row_ids.iter().copied().collect(), - overlay_pointer: overlay.map(|(p, _)| p), - overlay_checksum: overlay.map(|(_, c)| c), + overlay_pointer, + overlay_checksum, + }); + persisted_chunks.push(TablePageManifestChunk { + pointer, + checksum, + row_count: visible, + payload: Arc::clone(¤t_chunk.payload), + tombstoned_row_ids: Arc::clone(¤t_chunk.tombstoned_row_ids), + overlay_pointer, + overlay_checksum, + overlay_payload: current_chunk.overlay_payload.clone(), }); } @@ -20001,13 +20788,16 @@ fn rewrite_paged_table_from_manifest( } } - Ok(PersistedTableState { - 
pointer, - checksum, - row_count: manifest.row_count(), - tail, - pk_index_root: previous_state.pk_index_root, - }) + Ok(( + PersistedTableState { + pointer, + checksum, + row_count: manifest.row_count(), + tail, + pk_index_root: previous_state.pk_index_root, + }, + persisted_chunks, + )) } fn build_persistent_pk_index_root(db: &crate::db::Db, payload: &[u8]) -> Result> { @@ -21182,9 +21972,9 @@ struct SimpleRangeBound<'a> { } #[derive(Clone, Debug)] -struct SimpleRangeBoundValue { - inclusive: bool, - value: Value, +pub(crate) struct SimpleRangeBoundValue { + pub(crate) inclusive: bool, + pub(crate) value: Value, } #[derive(Clone, Debug)] @@ -23071,28 +23861,74 @@ fn project_simple_projection_row(stored_row: &StoredRow, projection_indexes: &[u project_simple_projection_values(&stored_row.values, projection_indexes) } -fn project_simple_projection_owned_row( - stored_row: StoredRow, +fn project_simple_projection_value_vec( + values: &[Value], projection_indexes: &[usize], -) -> QueryRow { - if projection_indexes.len() == stored_row.values.len() - && projection_indexes - .iter() - .copied() - .enumerate() - .all(|(expected, actual)| expected == actual) - { - return QueryRow::new(stored_row.values); +) -> Vec { + let mut projected = Vec::with_capacity(projection_indexes.len()); + for index in projection_indexes { + projected.push(values[*index].clone()); + } + projected +} + +fn push_projection_index(indexes: &mut Vec, index: usize) -> usize { + if let Some(position) = indexes.iter().position(|candidate| *candidate == index) { + position + } else { + indexes.push(index); + indexes.len() - 1 } - project_simple_projection_values(&stored_row.values, projection_indexes) } fn project_simple_projection_values(values: &[Value], projection_indexes: &[usize]) -> QueryRow { - let mut projected = Vec::with_capacity(projection_indexes.len()); - for index in projection_indexes { - projected.push(values[*index].clone()); + QueryRow::new(project_simple_projection_value_vec( + 
values, + projection_indexes, + )) +} + +fn project_resolved_simple_join_row( + projections: &[ResolvedSimpleJoinProjection], + left_values: &[Value], + right_values: &[Value], +) -> Result { + let mut projected = Vec::with_capacity(projections.len()); + for projection in projections { + let values = match projection.side { + SimpleJoinProjectionSide::Left => left_values, + SimpleJoinProjectionSide::Right => right_values, + }; + let value = values + .get(projection.index) + .ok_or_else(|| DbError::internal("prepared join projection index out of bounds"))?; + projected.push(value.clone()); + } + Ok(QueryRow::new(projected)) +} + +fn project_resolved_simple_join_row_from_full_values( + projections: &[ResolvedSimpleJoinProjection], + left_projection_indexes: &[usize], + right_projection_indexes: &[usize], + left_values: &[Value], + right_values: &[Value], +) -> Result { + let mut projected = Vec::with_capacity(projections.len()); + for projection in projections { + let (projection_indexes, values) = match projection.side { + SimpleJoinProjectionSide::Left => (left_projection_indexes, left_values), + SimpleJoinProjectionSide::Right => (right_projection_indexes, right_values), + }; + let original_index = projection_indexes + .get(projection.index) + .ok_or_else(|| DbError::internal("prepared join projection index out of bounds"))?; + let value = values + .get(*original_index) + .ok_or_else(|| DbError::internal("prepared join projection index out of bounds"))?; + projected.push(value.clone()); } - QueryRow::new(projected) + Ok(QueryRow::new(projected)) } fn covering_projection_offsets( @@ -23834,6 +24670,22 @@ fn decode_row_by_locator_from_payload( }) } +fn decode_projected_values_by_locator_from_payload( + store: Option<&S>, + payload: &[u8], + locator: RowLocatorV1, + projection_indexes: &[usize], +) -> Result> { + let start = locator.byte_offset as usize; + let end = start + .checked_add(locator.byte_len as usize) + .ok_or_else(|| DbError::corruption("row locator 
exceeded payload length"))?; + let row_bytes = payload + .get(start..end) + .ok_or_else(|| DbError::corruption("row locator exceeded payload length"))?; + Row::decode_projection_with_overflow(row_bytes, store, projection_indexes) +} + fn read_deferred_row_by_locator_from_table_payload( store: &S, state: PersistedTableState, @@ -23915,6 +24767,30 @@ fn read_deferred_row_by_cached_paged_locator( decode_row_by_locator_from_payload(payload, row_id, cached.locator).map(Some) } +fn read_deferred_projected_values_by_cached_paged_locator( + store: &S, + cached: CachedPagedRowLocator, + verified_payload: Option<&[u8]>, + projection_indexes: &[usize], +) -> Result> { + let owned_payload; + let payload = if let Some(payload) = verified_payload { + payload + } else { + owned_payload = read_overflow(store, cached.pointer)?; + if crc32c_parts(&[owned_payload.as_slice()]) != cached.checksum { + return Err(DbError::corruption("paged table chunk checksum mismatch")); + } + owned_payload.as_slice() + }; + decode_projected_values_by_locator_from_payload( + Some(store), + payload, + cached.locator, + projection_indexes, + ) +} + fn read_deferred_row_by_id_from_paged_chunk( store: &S, chunk: &PersistedTableChunkState, @@ -24108,6 +24984,44 @@ fn read_deferred_stored_row_by_id( read_deferred_row_by_id_from_table_payload(store, state, row_id) } +fn read_deferred_projected_values_by_id( + store: &S, + state: PersistedTableState, + table_schema: &TableSchema, + row_id: i64, + use_persistent_pk_index: bool, + paged_locator_cache: Option<&DeferredPagedRowLocatorCache>, + projection_indexes: &[usize], +) -> Result>> { + if state.pointer.is_table_paged_manifest() { + if let Some(cache) = paged_locator_cache.filter(|cache| cache.matches_state(state)) { + return cache + .locators + .get(&row_id) + .copied() + .map(|cached| { + read_deferred_projected_values_by_cached_paged_locator( + store, + cached, + cache.verified_payload(cached.pointer, cached.checksum), + projection_indexes, + ) + }) + 
.transpose(); + } + } + + read_deferred_stored_row_by_id( + store, + state, + table_schema, + row_id, + use_persistent_pk_index, + paged_locator_cache, + ) + .map(|row| row.map(|row| project_simple_projection_value_vec(&row.values, projection_indexes))) +} + fn deferred_rowid_lookup_available( state: PersistedTableState, table_schema: &TableSchema, @@ -33627,10 +34541,10 @@ mod tests { encode_paged_table_chunks, encode_paged_table_chunks_from_rows, encode_runtime_payload, encode_table_payload, like_match, persist_paged_table, read_deferred_row_by_id_from_table_payload, read_table_page_manifest_from_state, - rewrite_paged_table_from_resident, simple_trigram_lookup, ColumnBinding, Dataset, - DbTxnPageStore, EngineRuntime, OverflowPointer, PersistedTableState, RuntimeBtreeKeys, - RuntimeIndex, StoredRow, TableData, TablePageManifest, TablePageManifestChunk, - TableRowSource, + rewrite_paged_table_from_resident, simple_trigram_lookup, + try_append_only_paged_table_from_manifest, ColumnBinding, Dataset, DbTxnPageStore, + EngineRuntime, OverflowPointer, PersistedTableState, RuntimeBtreeKeys, RuntimeIndex, + StoredRow, TableData, TablePageManifest, TablePageManifestChunk, TableRowSource, }; const PAGE_SIZE: u32 = 4096; @@ -36487,6 +37401,91 @@ mod tests { ); } + #[test] + fn append_only_paged_manifest_rewrite_preserves_untouched_chunk_pointers() { + let body = "x".repeat(2048); + let mut store = InMemoryPageStore::new(PAGE_SIZE); + let initial = TableData::from_rows( + (0_i64..96_i64) + .map(|row_id| StoredRow { + row_id: row_id + 1, + values: vec![Value::Int64(row_id), Value::Text(body.clone())], + }) + .collect(), + ); + let initial_chunks = + encode_paged_table_chunks(&initial, PAGE_SIZE).expect("encode initial chunks"); + let initial_state = persist_paged_table( + &mut store, + PersistedTableState::default(), + &initial_chunks, + initial.rows.len(), + ) + .expect("persist initial paged table"); + let initial_manifest_payload = + 
crate::record::overflow::read_overflow(&store, initial_state.pointer) + .expect("read initial manifest"); + let initial_manifest = decode_paged_table_manifest_payload(&initial_manifest_payload) + .expect("decode initial manifest"); + assert!( + initial_manifest.chunks.len() > 2, + "expected multiple chunks to observe pointer preservation" + ); + let untouched_pointers = initial_manifest + .chunks + .iter() + .take(initial_manifest.chunks.len() - 1) + .map(|chunk| chunk.pointer) + .collect::>(); + + let mut page_manifest = + read_table_page_manifest_from_state(&store, initial_state).expect("read manifest"); + for row_id in 96_i64..144_i64 { + page_manifest + .append_row( + &StoredRow { + row_id: row_id + 1, + values: vec![Value::Int64(row_id), Value::Text(body.clone())], + }, + PAGE_SIZE, + ) + .expect("append row to paged manifest"); + } + + let (appended_state, persisted_chunks) = + try_append_only_paged_table_from_manifest(&mut store, initial_state, &page_manifest) + .expect("append-only paged manifest rewrite") + .expect("append-only path should handle tail rewrite and new chunks"); + let appended_manifest_payload = + crate::record::overflow::read_overflow(&store, appended_state.pointer) + .expect("read appended manifest"); + let appended_manifest = decode_paged_table_manifest_payload(&appended_manifest_payload) + .expect("decode appended manifest"); + let appended_page_manifest = + read_table_page_manifest_from_state(&store, appended_state).expect("read appended"); + let preserved_untouched = appended_manifest + .chunks + .iter() + .take(untouched_pointers.len()) + .map(|chunk| chunk.pointer) + .collect::>(); + let last_row = appended_page_manifest + .row_by_id(144) + .expect("read last row") + .expect("last row should exist"); + + assert_eq!(appended_state.row_count, 144); + assert_eq!(persisted_chunks.len(), appended_manifest.chunks.len()); + assert_eq!( + preserved_untouched, untouched_pointers, + "append-only manifest rewrite should preserve chunks before 
the previous tail" + ); + assert_eq!( + last_row.values(), + &[Value::Int64(143), Value::Text(body.clone())] + ); + } + #[test] fn rewrite_paged_table_from_resident_preserves_untouched_chunk_pointers() { let body = "x".repeat(2048); diff --git a/crates/decentdb/src/exec/row.rs b/crates/decentdb/src/exec/row.rs index aff6e97..8c982d7 100644 --- a/crates/decentdb/src/exec/row.rs +++ b/crates/decentdb/src/exec/row.rs @@ -23,7 +23,7 @@ impl QueryRow { #[derive(Clone, Debug, PartialEq)] pub struct QueryResult { - columns: Vec, + columns: Arc<[String]>, rows: Vec, affected_rows: u64, explain_lines: Vec, @@ -33,7 +33,7 @@ impl QueryResult { #[must_use] pub fn empty() -> Self { Self { - columns: Vec::new(), + columns: Arc::from([]), rows: Vec::new(), affected_rows: 0, explain_lines: Vec::new(), @@ -42,6 +42,17 @@ impl QueryResult { #[must_use] pub fn with_rows(columns: Vec, rows: Vec) -> Self { + let affected_rows = rows.len() as u64; + Self { + columns: Arc::from(columns), + rows, + affected_rows, + explain_lines: Vec::new(), + } + } + + #[must_use] + pub(crate) fn with_shared_columns(columns: Arc<[String]>, rows: Vec) -> Self { let affected_rows = rows.len() as u64; Self { columns, @@ -54,7 +65,7 @@ impl QueryResult { #[must_use] pub fn with_affected_rows(affected_rows: u64) -> Self { Self { - columns: Vec::new(), + columns: Arc::from([]), rows: Vec::new(), affected_rows, explain_lines: Vec::new(), @@ -64,7 +75,7 @@ impl QueryResult { #[must_use] pub fn with_explain(lines: Vec) -> Self { Self { - columns: vec!["plan".to_string()], + columns: Arc::from(["plan".to_string()]), rows: lines .iter() .cloned() @@ -77,7 +88,7 @@ impl QueryResult { #[must_use] pub fn columns(&self) -> &[String] { - &self.columns + self.columns.as_ref() } #[must_use] diff --git a/crates/decentdb/src/record/row.rs b/crates/decentdb/src/record/row.rs index 8991cea..3bea370 100644 --- a/crates/decentdb/src/record/row.rs +++ b/crates/decentdb/src/record/row.rs @@ -129,6 +129,65 @@ impl Row { 
Err(DbError::corruption("row field index exceeds field count")) } + pub(crate) fn decode_projection_with_overflow( + bytes: &[u8], + store: Option<&S>, + projection_indexes: &[usize], + ) -> Result> { + let (field_count, mut offset) = decode_varint_u64(bytes)?; + let field_count = usize::try_from(field_count) + .map_err(|_| DbError::corruption("row field count exceeds usize"))?; + if projection_indexes + .iter() + .any(|column_index| *column_index >= field_count) + { + return Err(DbError::corruption("row field index exceeds field count")); + } + + let mut projected: Vec> = vec![None; projection_indexes.len()]; + for field_index in 0..field_count { + let tag = *bytes + .get(offset) + .ok_or_else(|| DbError::corruption("truncated row field tag"))?; + offset += 1; + + let (payload_len, len_bytes) = decode_varint_u64(&bytes[offset..])?; + offset += len_bytes; + let payload_len = usize::try_from(payload_len) + .map_err(|_| DbError::corruption("field payload length exceeds usize"))?; + let payload_end = offset + payload_len; + let payload = bytes + .get(offset..payload_end) + .ok_or_else(|| DbError::corruption("truncated row field payload"))?; + offset = payload_end; + + let mut first_projection_offset: Option = None; + for (projection_offset, projection_index) in projection_indexes.iter().enumerate() { + if *projection_index != field_index { + continue; + } + if let Some(first_projection_offset) = first_projection_offset { + let value = projected[first_projection_offset] + .as_ref() + .ok_or_else(|| DbError::internal("projected row value missing"))? 
+ .clone(); + projected[projection_offset] = Some(value); + } else { + projected[projection_offset] = + Some(Self::decode_value_with_overflow(tag, payload, store)?); + first_projection_offset = Some(projection_offset); + } + } + } + + projected + .into_iter() + .map(|value| { + value.ok_or_else(|| DbError::corruption("projected row field was not decoded")) + }) + .collect() + } + pub(crate) fn encode_with_overflow( &self, store: Option<&mut S>, @@ -519,6 +578,155 @@ impl Row { Ok(Self { values }) } + + fn decode_value_with_overflow( + tag: u8, + payload: &[u8], + store: Option<&S>, + ) -> Result { + match tag { + TAG_NULL => { + if !payload.is_empty() { + return Err(DbError::corruption("NULL field must have empty payload")); + } + Ok(Value::Null) + } + TAG_INT64 => { + let (encoded, consumed) = decode_varint_u64(payload)?; + if consumed != payload.len() { + return Err(DbError::corruption("INT64 payload has trailing bytes")); + } + Ok(Value::Int64(zigzag_decode_u64(encoded))) + } + TAG_FLOAT64 => { + let bytes: [u8; 8] = payload + .try_into() + .map_err(|_| DbError::corruption("FLOAT64 payload must be 8 bytes"))?; + Ok(Value::Float64(f64::from_le_bytes(bytes))) + } + TAG_BOOL => match payload { + [0] => Ok(Value::Bool(false)), + [1] => Ok(Value::Bool(true)), + _ => Err(DbError::corruption("BOOL payload must be 0 or 1")), + }, + TAG_TEXT => Value::text_from_bytes(payload.to_vec()), + TAG_BLOB => Ok(Value::Blob(payload.to_vec())), + TAG_ENUM => { + let (enum_type_id, consumed_a) = decode_varint_u64(payload)?; + let (label_id, consumed_b) = decode_varint_u64(&payload[consumed_a..])?; + if consumed_a + consumed_b != payload.len() { + return Err(DbError::corruption("ENUM payload has trailing bytes")); + } + Ok(Value::Enum { + enum_type_id, + label_id, + }) + } + TAG_IPADDR => { + let (family, addr) = decode_ip_addr_payload(payload)?; + Ok(Value::IpAddr { family, addr }) + } + TAG_CIDR => { + let (family, prefix_len, network) = decode_cidr_payload(payload)?; + 
Ok(Value::Cidr { + family, + prefix_len, + network, + }) + } + TAG_MACADDR => { + let (len, bytes) = decode_mac_addr_payload(payload)?; + Ok(Value::MacAddr { len, bytes }) + } + TAG_DATE => { + let (encoded, consumed) = decode_varint_u64(payload)?; + if consumed != payload.len() { + return Err(DbError::corruption("DATE payload has trailing bytes")); + } + let value = zigzag_decode_u64(encoded); + let date = i32::try_from(value) + .map_err(|_| DbError::corruption("DATE payload exceeds i32 range"))?; + Ok(Value::DateDays(date)) + } + TAG_TIME => { + let (encoded, consumed) = decode_varint_u64(payload)?; + if consumed != payload.len() { + return Err(DbError::corruption("TIME payload has trailing bytes")); + } + Ok(Value::TimeMicros(zigzag_decode_u64(encoded))) + } + TAG_TIMESTAMP_TZ => { + let (encoded, consumed) = decode_varint_u64(payload)?; + if consumed != payload.len() { + return Err(DbError::corruption( + "TIMESTAMPTZ payload has trailing bytes", + )); + } + Ok(Value::TimestampTzMicros(zigzag_decode_u64(encoded))) + } + TAG_INTERVAL => { + let (months_encoded, consumed_a) = decode_varint_u64(payload)?; + let (days_encoded, consumed_b) = decode_varint_u64(&payload[consumed_a..])?; + let (micros_encoded, consumed_c) = + decode_varint_u64(&payload[consumed_a + consumed_b..])?; + if consumed_a + consumed_b + consumed_c != payload.len() { + return Err(DbError::corruption("INTERVAL payload has trailing bytes")); + } + let months = i32::try_from(zigzag_decode_u64(months_encoded)) + .map_err(|_| DbError::corruption("INTERVAL months exceed i32 range"))?; + let days = i32::try_from(zigzag_decode_u64(days_encoded)) + .map_err(|_| DbError::corruption("INTERVAL days exceed i32 range"))?; + Ok(Value::Interval { + months, + days, + micros: zigzag_decode_u64(micros_encoded), + }) + } + TAG_GEOMETRY => Ok(Value::Geometry(payload.to_vec())), + TAG_GEOGRAPHY => Ok(Value::Geography(payload.to_vec())), + TAG_DECIMAL => { + let scale = *payload + .first() + .ok_or_else(|| 
DbError::corruption("DECIMAL payload missing scale"))?; + let (encoded, consumed) = decode_varint_u64(&payload[1..])?; + if consumed + 1 != payload.len() { + return Err(DbError::corruption("DECIMAL payload has trailing bytes")); + } + Ok(Value::Decimal { + scaled: zigzag_decode_u64(encoded), + scale, + }) + } + TAG_UUID => { + let bytes: [u8; 16] = payload + .try_into() + .map_err(|_| DbError::corruption("UUID payload must be 16 bytes"))?; + Ok(Value::Uuid(bytes)) + } + TAG_TIMESTAMP => { + let (encoded, consumed) = decode_varint_u64(payload)?; + if consumed != payload.len() { + return Err(DbError::corruption("TIMESTAMP payload has trailing bytes")); + } + Ok(Value::TimestampMicros(zigzag_decode_u64(encoded))) + } + TAG_TEXT_OVERFLOW => { + let store = store.ok_or_else(|| { + DbError::constraint("TEXT overflow decoding requires a page store") + })?; + let pointer = decode_overflow_pointer(payload)?; + Value::text_from_bytes(read_overflow(store, pointer)?) + } + TAG_BLOB_OVERFLOW => { + let store = store.ok_or_else(|| { + DbError::constraint("BLOB overflow decoding requires a page store") + })?; + let pointer = decode_overflow_pointer(payload)?; + Ok(Value::Blob(read_overflow(store, pointer)?)) + } + _ => Err(DbError::corruption(format!("unknown row value tag {tag}"))), + } + } } fn encode_overflow_pointer(pointer: OverflowPointer) -> [u8; 9] { @@ -654,6 +862,33 @@ mod tests { assert!(Row::decode_int64_at(&encoded, 0).is_err()); } + #[test] + fn decode_projection_materializes_requested_columns_in_order() { + let row = Row::new(vec![ + Value::Int64(7), + Value::Text("selected".to_string()), + Value::Bool(true), + ]); + let encoded = row.encode().expect("encode"); + + let projected = + Row::decode_projection_with_overflow::(&encoded, None, &[1, 0, 1]) + .expect("decode projection"); + + assert_eq!( + projected, + vec![ + Value::Text("selected".to_string()), + Value::Int64(7), + Value::Text("selected".to_string()), + ] + ); + assert!( + 
Row::decode_projection_with_overflow::(&encoded, None, &[3]) + .is_err() + ); + } + #[test] fn large_text_and_blob_values_spill_to_overflow_pages() { let mut store = InMemoryPageStore::default(); diff --git a/crates/decentdb/src/vfs/faulty.rs b/crates/decentdb/src/vfs/faulty.rs index e28277a..7f646f0 100644 --- a/crates/decentdb/src/vfs/faulty.rs +++ b/crates/decentdb/src/vfs/faulty.rs @@ -3,6 +3,7 @@ use std::collections::HashMap; use std::path::{Path, PathBuf}; +use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::{Arc, Mutex, OnceLock}; use std::thread::ThreadId; @@ -160,6 +161,32 @@ impl VfsFile for FaultyVfsFile { } } + fn write_all_at_many(&self, writes: &[(u64, &[u8])]) -> Result<()> { + if self.state.can_passthrough_batch() { + return self.inner.write_all_at_many(writes); + } + + for (offset, buf) in writes { + let mut cursor = 0; + while cursor < buf.len() { + let written = self.write_at(*offset + cursor as u64, &buf[cursor..])?; + if written == 0 { + return Err(DbError::io( + format!( + "short write on {} at offset {}: expected {} bytes, got {cursor}", + self.path().display(), + *offset + cursor as u64, + buf.len() + ), + std::io::Error::new(std::io::ErrorKind::WriteZero, "short write"), + )); + } + cursor += written; + } + } + Ok(()) + } + fn advise_sequential(&self) -> Result<()> { self.inner.advise_sequential() } @@ -236,6 +263,7 @@ impl VfsFile for FaultyVfsFile { #[derive(Debug, Default)] struct FaultState { + active_failpoints: AtomicUsize, failpoints: Mutex>>, hits: Mutex>, logs: Mutex>, @@ -243,7 +271,20 @@ struct FaultState { } impl FaultState { + fn can_passthrough_batch(&self) -> bool { + if !self.has_active_failpoints() { + return true; + } + if !self.is_owner_thread() { + return true; + } + self.failpoints.lock().expect("fault state lock").is_empty() + } + fn decision(&self, label: &str) -> FaultDecision { + if !self.has_active_failpoints() { + return FaultDecision::Untracked; + } if !self.is_owner_thread() { return 
FaultDecision::Untracked; } @@ -279,6 +320,10 @@ impl FaultState { FaultDecision::Pass { hit } } + fn has_active_failpoints(&self) -> bool { + self.active_failpoints.load(Ordering::Acquire) != 0 + } + fn log(&self, label: &str, hit: u64, outcome: &str) { if !self.is_owner_thread() { return; @@ -326,6 +371,7 @@ pub(crate) fn install_failpoint(failpoint: Failpoint) -> Result<()> { .entry(failpoint.label.clone()) .or_default() .push(failpoint); + state.active_failpoints.fetch_add(1, Ordering::Release); Ok(()) } @@ -336,6 +382,7 @@ pub(crate) fn clear_failpoints() -> Result<()> { .lock() .map_err(|_| DbError::internal("fault state lock poisoned"))? .clear(); + state.active_failpoints.store(0, Ordering::Release); state .hits .lock() @@ -606,6 +653,38 @@ mod tests { clear_failpoints().expect("clear failpoints"); } + #[test] + fn batched_writes_honor_failpoints() { + let _guard = super::test_failpoint_lock().lock().expect("test lock"); + clear_failpoints().expect("clear failpoints"); + install_failpoint(Failpoint { + label: "db.write_page".to_string(), + trigger_on: 2, + action: FailAction::PartialWrite { bytes: 2 }, + }) + .expect("install partial write"); + + let vfs = FaultyVfs::wrap(Arc::new(crate::vfs::mem::MemVfs::default())); + let file = vfs + .open( + Path::new(":memory:"), + OpenMode::CreateNew, + FileKind::Database, + ) + .expect("create file"); + + file.write_all_at_many(&[(4096, &[1, 2, 3]), (8192, &[4, 5, 6])]) + .expect("batched write retries partial writes"); + let logs = failpoint_logs().expect("read logs"); + assert!( + logs.iter() + .any(|entry| entry.label == "db.write_page" && entry.outcome == "partial_write:2"), + "partial batched write should be logged" + ); + + clear_failpoints().expect("clear failpoints"); + } + #[test] fn classify_functions_return_expected_labels() { let _guard = super::test_failpoint_lock().lock().expect("test lock"); diff --git a/crates/decentdb/src/vfs/stats.rs b/crates/decentdb/src/vfs/stats.rs index 1c450b4..1f4ce41 100644 
--- a/crates/decentdb/src/vfs/stats.rs +++ b/crates/decentdb/src/vfs/stats.rs @@ -237,6 +237,24 @@ impl VfsFile for StatsVfsFile { Ok(written) } + fn write_all_at_many(&self, writes: &[(u64, &[u8])]) -> Result<()> { + self.inner.write_all_at_many(writes)?; + let state = global_stats_state(); + if state.enabled() { + let stats = state.file_stats(self.inner.kind()); + stats.write_calls.fetch_add( + u64::try_from(writes.len()).unwrap_or(u64::MAX), + Ordering::Relaxed, + ); + let bytes = writes + .iter() + .map(|(_, buf)| u64::try_from(buf.len()).unwrap_or(u64::MAX)) + .fold(0_u64, u64::saturating_add); + stats.bytes_written.fetch_add(bytes, Ordering::Relaxed); + } + Ok(()) + } + fn advise_sequential(&self) -> Result<()> { self.inner.advise_sequential() } diff --git a/crates/decentdb/src/wal/index.rs b/crates/decentdb/src/wal/index.rs index 7b711a2..4db7488 100644 --- a/crates/decentdb/src/wal/index.rs +++ b/crates/decentdb/src/wal/index.rs @@ -288,6 +288,10 @@ impl WalIndex { } } + pub(crate) fn has_dirty_since_demote(&self) -> bool { + !self.dirty_since_demote.is_empty() + } + pub(crate) fn unmark_dirty_since_demote(&mut self, page_id: PageId) { if !self.dirty_since_demote_set.remove(&page_id) { return; diff --git a/crates/decentdb/src/wal/writer.rs b/crates/decentdb/src/wal/writer.rs index 0c6642d..1a5b85c 100644 --- a/crates/decentdb/src/wal/writer.rs +++ b/crates/decentdb/src/wal/writer.rs @@ -187,6 +187,7 @@ pub(crate) fn commit_pages( offset = new_offset; sync_for_mode(wal, metadata_changed, new_offset)?; + let should_demote_cold_versions; { let mut index = wal .inner @@ -245,10 +246,14 @@ pub(crate) fn commit_pages( wal.inner .pages_since_checkpoint .fetch_add(prepared_count_u32, Ordering::AcqRel); + should_demote_cold_versions = + wal.inner.wal_index_hot_set_pages != 0 || index.has_dirty_since_demote(); } wal.publish_process_commit(offset)?; drop(writer_state); - demote_cold_versions(wal)?; + if should_demote_cold_versions { + demote_cold_versions(wal)?; + } 
maybe_auto_checkpoint(wal, pager)?; Ok(offset) } @@ -351,6 +356,7 @@ pub(crate) fn commit_pages_if_latest( offset = new_offset; sync_for_mode(wal, metadata_changed, new_offset)?; + let should_demote_cold_versions; { let mut index = wal .inner @@ -404,10 +410,14 @@ pub(crate) fn commit_pages_if_latest( wal.inner .pages_since_checkpoint .fetch_add(prepared_count_u32, Ordering::AcqRel); + should_demote_cold_versions = + wal.inner.wal_index_hot_set_pages != 0 || index.has_dirty_since_demote(); } wal.publish_process_commit(offset)?; drop(writer_state); - demote_cold_versions(wal)?; + if should_demote_cold_versions { + demote_cold_versions(wal)?; + } maybe_auto_checkpoint(wal, pager)?; Ok(offset) } diff --git a/crates/decentdb/tests/unit_wal_recovery_edge_cases_tests.rs b/crates/decentdb/tests/unit_wal_recovery_edge_cases_tests.rs index 26796ba..508b853 100644 --- a/crates/decentdb/tests/unit_wal_recovery_edge_cases_tests.rs +++ b/crates/decentdb/tests/unit_wal_recovery_edge_cases_tests.rs @@ -126,12 +126,15 @@ fn uncommitted_wal_frames_are_not_visible_after_reopen() { // Simulate a crash before the commit frame is written. Db::clear_failpoints().unwrap(); - Db::install_failpoint("wal.write_commit", "error", 1, 0).unwrap(); { let db = Db::open(&path, config.clone()).unwrap(); - let _ = db.execute("INSERT INTO t VALUES (2)"); // expected to fail + Db::install_failpoint("wal.write_commit", "error", 1, 0).unwrap(); + let error = db + .execute("INSERT INTO t VALUES (2)") + .expect_err("commit frame write should fail"); + assert!(matches!(error, DbError::Io { .. })); + Db::clear_failpoints().unwrap(); } - Db::clear_failpoints().unwrap(); // Re-open: only committed row (id=1) should be visible. 
let db = Db::open(&path, config).unwrap(); diff --git a/data/bench_summary.json b/data/bench_summary.json index 7f72725..63a489e 100644 --- a/data/bench_summary.json +++ b/data/bench_summary.json @@ -1,142 +1,113 @@ { "engines": { - "H2": { - "db_plus_wal_size_mb": 0.0, - "db_size_mb_main": 0.0, - "db_size_mb_source": "python:h2(jdbc)", - "os_cache_state": "unknown_os_cache", - "os_cache_state_source": "python:h2(jdbc)", - "process_state": "warm_process", - "process_state_source": "python:h2(jdbc)", - "read_p95_ms": 0.15576200007672014, - "storage_state": "reused_storage", - "storage_state_source": "python:h2(jdbc)", - "wal_size_mb": 0.0, - "wal_size_mb_source": "python:h2(jdbc)" - }, - "HSQLDB": { - "db_plus_wal_size_mb": 0.0, - "db_size_mb_main": 0.0, - "db_size_mb_source": "python:hsqldb(jdbc)", - "os_cache_state": "unknown_os_cache", - "os_cache_state_source": "python:hsqldb(jdbc)", - "process_state": "warm_process", - "process_state_source": "python:hsqldb(jdbc)", - "read_p95_ms": 0.24111100015034026, - "storage_state": "reused_storage", - "storage_state_source": "python:hsqldb(jdbc)", - "wal_size_mb": 0.0, - "wal_size_mb_source": "python:hsqldb(jdbc)" - }, "decentdb_balanced_durable": { - "aggregate_p95_ms": 0.12736039999999998, - "aggregate_p95_stddev_ms": 0.0004666813045323348, - "commit_p95_ms": 0.9060354, - "commit_p95_stddev_ms": 0.04657700872147116, - "concurrent_read_p95_ms": 0.0453704, - "concurrent_read_p95_stddev_ms": 0.0012208604506658408, - "concurrent_read_threads": 4, - "db_plus_wal_size_mb": 3.691436767578125, - "db_size_mb": 3.691436767578125, - "db_size_mb_main": 3.69140625, - "insert_rows_per_sec": 2657465.7364433794, - "insert_rps_stddev": 114921.16717212144, - "join_p95_ms": 0.028585400000000004, - "join_p95_stddev_ms": 0.0006733024877423228, - "range_scan_p95_ms": 0.6252152000000001, - "range_scan_p95_stddev_ms": 0.004671534368919912, - "read_p95_ms": 0.015484799999999998, - "read_p95_stddev_ms": 0.00020255014194021134, - "wal_size_mb": 
3.0517578125e-05 + "read_p95_ms": 0.0011560000000000001, + "read_p95_stddev_ms": 0.00015134067529914093, + "join_p95_ms": 0.001583, + "join_p95_stddev_ms": 0.000257426494362954, + "commit_p95_ms": 3.0680708, + "commit_p95_stddev_ms": 0.0015794434969317037, + "insert_rows_per_sec": 2370472.477434227, + "insert_rps_stddev": 76355.01567279924, + "db_size_mb": 3.687530517578125, + "db_size_mb_main": 3.6875, + "wal_size_mb": 0.000030517578125, + "db_plus_wal_size_mb": 3.687530517578125, + "range_scan_p95_ms": 0.009545799999999998, + "range_scan_p95_stddev_ms": 0.0011745670521515577, + "aggregate_p95_ms": 0.0006293999999999999, + "aggregate_p95_stddev_ms": 0.00007103407632960394, + "concurrent_read_p95_ms": 0.009333800000000001, + "concurrent_read_p95_stddev_ms": 0.002637899270252752, + "concurrent_read_threads": 4 }, "decentdb_low_memory_durable": { - "aggregate_p95_ms": 0.12786939999999997, - "aggregate_p95_stddev_ms": 0.000495302170397021, - "commit_p95_ms": 0.871179, - "commit_p95_stddev_ms": 0.0208144716387421, - "concurrent_read_p95_ms": 0.044307, - "concurrent_read_p95_stddev_ms": 0.0009470983053516666, - "concurrent_read_threads": 4, - "db_plus_wal_size_mb": 3.691436767578125, - "db_size_mb": 3.691436767578125, - "db_size_mb_main": 3.69140625, - "insert_rows_per_sec": 2671951.013522356, - "insert_rps_stddev": 12566.05597565607, - "join_p95_ms": 0.030598999999999998, - "join_p95_stddev_ms": 0.002953626990667576, - "range_scan_p95_ms": 0.626549, - "range_scan_p95_stddev_ms": 0.00125886027818818, - "read_p95_ms": 0.0157276, - "read_p95_stddev_ms": 0.0001930456940726723, - "wal_size_mb": 3.0517578125e-05 + "read_p95_ms": 0.00105, + "read_p95_stddev_ms": 0.000024819347291981696, + "join_p95_ms": 0.0014124, + "join_p95_stddev_ms": 0.00010103583522691344, + "commit_p95_ms": 3.0700228000000003, + "commit_p95_stddev_ms": 0.007619817357391176, + "insert_rows_per_sec": 2327752.4590236107, + "insert_rps_stddev": 28565.854311925385, + "db_size_mb": 3.687530517578125, + 
"db_size_mb_main": 3.6875, + "wal_size_mb": 0.000030517578125, + "db_plus_wal_size_mb": 3.687530517578125, + "range_scan_p95_ms": 0.008851999999999999, + "range_scan_p95_stddev_ms": 0.00005607138307550521, + "aggregate_p95_ms": 0.0005771999999999999, + "aggregate_p95_stddev_ms": 0.000052247105182966845, + "concurrent_read_p95_ms": 0.0087148, + "concurrent_read_p95_stddev_ms": 0.002455557810355928, + "concurrent_read_threads": 4 }, "decentdb_tuned_durable": { - "aggregate_p95_ms": 0.030653399999999997, - "aggregate_p95_stddev_ms": 0.00017825666887945552, - "commit_p95_ms": 0.46221700000000004, - "commit_p95_stddev_ms": 0.01063252600278974, - "concurrent_read_p95_ms": 0.004815, - "concurrent_read_p95_stddev_ms": 0.001050921119780167, - "concurrent_read_threads": 4, - "db_plus_wal_size_mb": 3.210968017578125, + "read_p95_ms": 0.0008696000000000001, + "read_p95_stddev_ms": 0.00001935561933909635, + "join_p95_ms": 0.001106, + "join_p95_stddev_ms": 0.00010855413396089528, + "commit_p95_ms": 3.0673214, + "commit_p95_stddev_ms": 0.007062335154890256, + "insert_rows_per_sec": 2641010.195681443, + "insert_rps_stddev": 117630.73260262053, "db_size_mb": 3.210968017578125, "db_size_mb_main": 3.2109375, - "insert_rows_per_sec": 3251228.8774911426, - "insert_rps_stddev": 61382.750151157976, - "join_p95_ms": 0.0033504, - "join_p95_stddev_ms": 3.8536216731796604e-05, - "range_scan_p95_ms": 0.0123588, - "range_scan_p95_stddev_ms": 0.0003586454516650117, - "read_p95_ms": 0.0019974, - "read_p95_stddev_ms": 3.287917273898469e-05, - "wal_size_mb": 3.0517578125e-05 + "wal_size_mb": 0.000030517578125, + "db_plus_wal_size_mb": 3.210968017578125, + "range_scan_p95_ms": 0.0061034, + "range_scan_p95_stddev_ms": 0.000030374989711932418, + "aggregate_p95_ms": 0.0005694, + "aggregate_p95_stddev_ms": 0.000051437729343352615, + "concurrent_read_p95_ms": 0.009109200000000001, + "concurrent_read_p95_stddev_ms": 0.0035874395548914826, + "concurrent_read_threads": 4 }, "duckdb_engine_default": { - 
"aggregate_p95_ms": 0.2959944, - "aggregate_p95_stddev_ms": 0.005507752231173802, - "commit_p95_ms": 0.8386392, - "commit_p95_stddev_ms": 0.039832933170430705, - "concurrent_read_p95_ms": 0.22942800000000002, - "concurrent_read_p95_stddev_ms": 0.0022572003898635126, - "concurrent_read_threads": 4, - "db_plus_wal_size_mb": 3.26171875, + "read_p95_ms": 0.15063100000000001, + "read_p95_stddev_ms": 0.0008605365767938072, + "join_p95_ms": 0.33191560000000003, + "join_p95_stddev_ms": 0.012812787231512135, + "commit_p95_ms": 3.0641356, + "commit_p95_stddev_ms": 0.0030963806355162205, + "insert_rows_per_sec": 7184.7217589473, + "insert_rps_stddev": 36.27377018643688, "db_size_mb": 3.26171875, "db_size_mb_main": 3.26171875, - "insert_rows_per_sec": 4993.275606149956, - "insert_rps_stddev": 57.2979577214086, - "join_p95_ms": 0.48079859999999996, - "join_p95_stddev_ms": 0.0053281728050054945, - "range_scan_p95_ms": 1.0447058, - "range_scan_p95_stddev_ms": 0.11075548026061736, - "read_p95_ms": 0.2218696, - "read_p95_stddev_ms": 0.001274163662957002, - "wal_size_mb": 0.0 + "wal_size_mb": 0.0, + "db_plus_wal_size_mb": 3.26171875, + "range_scan_p95_ms": 0.8018722, + "range_scan_p95_stddev_ms": 0.09604424419276776, + "aggregate_p95_ms": 0.200773, + "aggregate_p95_stddev_ms": 0.004520799221376679, + "concurrent_read_p95_ms": 0.15536360000000002, + "concurrent_read_p95_stddev_ms": 0.00403552894178693, + "concurrent_read_threads": 4 }, "sqlite": { - "aggregate_p95_ms": 0.0351562, - "aggregate_p95_stddev_ms": 8.745375921022331e-05, - "commit_p95_ms": 0.48844180000000004, - "commit_p95_stddev_ms": 0.024638472496484033, - "concurrent_read_p95_ms": 0.038827, - "concurrent_read_p95_stddev_ms": 0.001107624123969861, - "concurrent_read_threads": 4, - "db_plus_wal_size_mb": 2.1953125, + "read_p95_ms": 0.002579, + "read_p95_stddev_ms": 0.00008428523002282197, + "join_p95_ms": 0.0031218, + "join_p95_stddev_ms": 0.00015111240849116267, + "commit_p95_ms": 3.0638628, + "commit_p95_stddev_ms": 
0.0043429923969539475, + "insert_rows_per_sec": 2083404.9552702922, + "insert_rps_stddev": 35862.688079146596, "db_size_mb": 2.1953125, "db_size_mb_main": 2.1953125, - "insert_rows_per_sec": 2089870.293231697, - "insert_rps_stddev": 18257.729535046936, - "join_p95_ms": 0.003222, - "join_p95_stddev_ms": 3.1999999999999965e-05, - "range_scan_p95_ms": 0.011000999999999999, - "range_scan_p95_stddev_ms": 0.0004538995483584446, - "read_p95_ms": 0.0028414, - "read_p95_stddev_ms": 8.212186067059174e-06, - "wal_size_mb": 0.0 + "wal_size_mb": 0.0, + "db_plus_wal_size_mb": 2.1953125, + "range_scan_p95_ms": 0.0144852, + "range_scan_p95_stddev_ms": 0.000567469611521181, + "aggregate_p95_ms": 0.030896, + "aggregate_p95_stddev_ms": 0.0007275284186889194, + "concurrent_read_p95_ms": 0.015523200000000001, + "concurrent_read_p95_stddev_ms": 0.0008814745373520442, + "concurrent_read_threads": 4 } }, "metadata": { "aggregate_workload": "prepared_count_sum_aggregate_on_user_id", - "aggregated_at": "2026-06-10T20:20:55.323790+00:00", "benchmark_profile": "single_thread_prepared_statement_oltp_with_concurrent_read_extension", "binding_parity_note": "sqlite via rusqlite FFI; duckdb via duckdb-rs FFI; decentdb via native Rust API (zero FFI overhead in benchmark loop)", "commit_workload": "prepared_single_row_auto_commit_insert_p95", @@ -160,17 +131,15 @@ "join_workload": "prepared_inner_join_lookup_with_value_materialization", "latency_capture_unit": "nanoseconds", "latency_report_unit": "milliseconds", - "machine": "AMD EPYC 7763 64-Core Processor", - "notes": "merged extra engines from .tmp/benchmark-assets/root-readme-results.json", + "machine": "AMD Ryzen 9 3900X 12-Core Processor", "os": "linux", "os_cache_state": "unknown_os_cache", "process_coordination_note": "native cross-engine benchmarks are single-process comparisons; cross-process coordination overhead is measured separately", "process_state": "warm_process", - "python_merge_target_operations": 500, "range_scan_workload": 
"prepared_range_scan_50_rows_ordered", "read_pattern": "deterministic_permutation", "read_workload": "prepared_point_lookup_with_value_materialization", - "run_id": "1781122269865", + "run_id": "1781237431944", "size_measurement": "db_plus_wal_after_checkpoint", "sqlite_durability": "wal+synchronous_full+wal_autocheckpoint_0", "sqlite_profile": "sqlite_wal_full", @@ -179,4 +148,4 @@ "storage_state": "reused_storage", "wal_autocheckpoint_note": "sqlite wal_autocheckpoint=0 is a benchmark-specific tuning; production defaults differ" } -} +} \ No newline at end of file diff --git a/design/METRIC_IMPROVEMENTS_PLAN.md b/design/METRIC_IMPROVEMENTS_PLAN.md index 048a0c9..916550a 100644 --- a/design/METRIC_IMPROVEMENTS_PLAN.md +++ b/design/METRIC_IMPROVEMENTS_PLAN.md @@ -73,6 +73,47 @@ Notes: `data/bench_summary.json`, but they are not currently rendered in the README images. +## Current Worktree Public Metrics + +Latest local public benchmark: + +```bash +cargo bench -p decentdb --bench embedded_compare +``` + +Output summary: `data/bench_summary.json`, generated on 2026-06-12 from the +`pk-lookup-profiled` worktree. 
+ +| Metric | SQLite | Balanced | Balanced vs SQLite | Low-memory | Low-memory vs SQLite | Tuned | Tuned vs SQLite | Current status | +|---|---:|---:|---:|---:|---:|---:|---:|---| +| `insert_rows_per_sec` | 2,083,405 rows/s | 2,370,472 rows/s | 1.14x | 2,327,752 rows/s | 1.12x | 2,641,010 rows/s | 1.27x | DecentDB wins | +| `read_p95_ms` | 0.002579 ms | 0.001156 ms | 2.23x | 0.001050 ms | 2.46x | 0.000870 ms | 2.97x | DecentDB wins | +| `commit_p95_ms` | 3.063863 ms | 3.068071 ms | 0.999x | 3.070023 ms | 0.998x | 3.067321 ms | 0.999x | At parity, not a beyond-noise win | +| `join_p95_ms` | 0.003122 ms | 0.001583 ms | 1.97x | 0.001412 ms | 2.21x | 0.001106 ms | 2.82x | DecentDB wins | +| `range_scan_p95_ms` | 0.014485 ms | 0.009546 ms | 1.52x | 0.008852 ms | 1.64x | 0.006103 ms | 2.37x | DecentDB wins | +| `aggregate_p95_ms` | 0.030896 ms | 0.000629 ms | 49.09x | 0.000577 ms | 53.53x | 0.000569 ms | 54.26x | DecentDB wins | +| `concurrent_read_p95_ms` | 0.015523 ms | 0.009334 ms | 1.66x | 0.008715 ms | 1.78x | 0.009109 ms | 1.70x | DecentDB wins | + +Interpretation: + +- The public README non-commit metrics now beat SQLite for every DecentDB + profile in this worktree: point lookup, indexed range scan, join lookup, + aggregate, concurrent read, and bulk insert throughput are all ahead. +- Durable commit p95 is at the same single-`fsync` floor as SQLite. Multiple + local runs have moved both engines by several microseconds, and the latest + run has DecentDB 0.1-0.2% behind SQLite with overlapping standard deviations. + This must stay tracked as the remaining public metric blocker because it is + not a beyond-noise DecentDB win. 
+- The current no-ADR commit-path work reduced engine overhead without changing + durability: batched WAL writes now pass through VFS wrappers, no-failpoint + VFS operations avoid failpoint-registry mutexes, no-op reactive publish + returns before hub lookup, and prepared auto-commit inserts skip redundant + post-commit re-deferral when no touched table is paged. +- Further durable-commit improvement likely needs either a clearly measured + syscall-level optimization or an ADR-backed WAL/recovery change. Do not relax + `WalSyncMode::Full`, skip the WAL header end-offset update, or otherwise + weaken ACID semantics to win this metric. + ## Rust-Baseline SQLite Comparison The rust-baseline workload is not the public README chart input. It is a larger @@ -180,14 +221,15 @@ Next point-lookup follow-ups: | Rank | Priority metric / area | Public chart coverage | Rust-baseline coverage | Baseline status | Target | |---:|---|---|---|---|---| -| 1 | Point lookup latency | `read_p95_ms` | `query_artist_by_id` | Worktree now wins rust-baseline medium/full/huge and cuts smoke roughly in half, but smoke median still trails SQLite; public chart rerun pending | Finish fixed-overhead work, rerun public benchmark, keep tuned ahead and close smoke gap | -| 2 | Range scan latency | `range_scan_p95_ms` | Partial overlap through indexed scans and view paths | Tuned public row is 0.89x vs SQLite | Bring tuned above 1.00x vs SQLite and reduce balanced gap | -| 3 | Join and view lookup latency | `join_p95_ms` | `query_view_first_1000`, `query_songs_for_artist_via_view` | Tuned public row is 0.96x; rust-baseline view paths lose strongly | Bring public join above 1.00x and reduce view-path latency materially | -| 4 | Durable commit latency | `commit_p95_ms` | Not directly represented in rust-baseline totals | Tuned public row wins narrowly at 1.06x | Protect or improve without weakening ACID guarantees | -| 5 | Concurrent read latency | `concurrent_read_p95_ms` | Not directly represented in 
rust-baseline | Tuned public row wins strongly | Protect; watch for reader-cache or locking regressions | -| 6 | Aggregate latency | `aggregate_p95_ms` | `query_aggregate_durations`, grouped Top-N queries | Tuned public row wins; rust-baseline wins strongly | Protect wins; optimize only if shared hot-path work helps higher priorities | -| 7 | Insert throughput | `insert_rows_per_sec` | `seed_songs` and seed loops | DecentDB wins public and rust-baseline insert paths | Protect wins; avoid trading write durability for chart gains | -| 8 | Size and memory | Stored in summary, not charted | RSS, DB size, WAL size in rust-baseline JSON | Not public-charted today | Track opportunistically; consider adding public visibility later | +| 1 | Durable commit latency | `commit_p95_ms` | Not directly represented in rust-baseline totals | Current worktree is at parity but still 0.1-0.2% behind SQLite in the latest public run | Find a no-durability-regression win beyond sync noise, or document that an ADR-level WAL change is required | +| 2 | Rust-baseline view lookup latency | Not directly charted | `query_view_first_1000`, `query_songs_for_artist_via_view` | Public join now wins, but rust-baseline view paths still lose strongly | Reduce view expansion/materialization overhead without regressing public join/range wins | +| 3 | Point lookup latency | `read_p95_ms` | `query_artist_by_id` | Public metric now wins across profiles; rust-baseline medium/full/huge win, smoke median remains close | Protect public wins and close remaining smoke fixed-overhead gap opportunistically | +| 4 | Range scan latency | `range_scan_p95_ms` | Partial overlap through indexed scans and view paths | Public metric now wins across profiles | Protect; optimize only if shared view/range work helps rust-baseline | +| 5 | Join latency | `join_p95_ms` | View and join query shapes | Public metric now wins across profiles | Protect public win while improving rust-baseline views | +| 6 | Concurrent read latency 
| `concurrent_read_p95_ms` | Not directly represented in rust-baseline | Public metric now wins across profiles | Protect; watch for reader-cache or locking regressions | +| 7 | Aggregate latency | `aggregate_p95_ms` | `query_aggregate_durations`, grouped Top-N queries | Public and rust-baseline aggregate paths win strongly | Protect wins; optimize only if shared hot-path work helps higher priorities | +| 8 | Insert throughput | `insert_rows_per_sec` | `seed_songs` and seed loops | DecentDB wins public and rust-baseline insert paths | Protect wins; avoid trading write durability for chart gains | +| 9 | Size and memory | Stored in summary, not charted | RSS, DB size, WAL size in rust-baseline JSON | Not public-charted today | Track opportunistically; consider adding public visibility later | ## Execution Plan From 5a77f060dfeb7f823296d42723c9c98428b89a94 Mon Sep 17 00:00:00 2001 From: Steven Hildreth Date: Fri, 12 Jun 2026 08:49:02 -0500 Subject: [PATCH 7/9] Implement benchmark suite and reporting for Rust baseline - Added a new command-line argument `--benchmark` to run all scales (smoke, medium, full, huge) in order. - Introduced `BENCHMARK_SCALES` constant to define the order of benchmark scales. - Refactored the `run` function to handle benchmark execution and reporting. - Created `run_benchmark_suite` function to execute benchmarks for each scale and generate reports. - Added helper functions for file size retrieval and WAL path generation. - Updated the report generation logic to include additional metrics such as database and WAL sizes before and after checkpoints. - Added new JSON result files for benchmark runs at different scales (smoke, medium, full, huge) with detailed step metrics. 
--- benchmarks/rust-baseline/README.md | 62 ++--- ...06-12-1340-rust-baseline-default-full.json | 155 +++++++++++++ ...-12-1340-rust-baseline-default-medium.json | 155 +++++++++++++ ...6-12-1340-rust-baseline-default-smoke.json | 155 +++++++++++++ ...06-12-1342-rust-baseline-default-huge.json | 155 +++++++++++++ benchmarks/rust-baseline/results/report.html | 4 +- benchmarks/rust-baseline/src/main.rs | 215 ++++++++++++++---- 7 files changed, 825 insertions(+), 76 deletions(-) create mode 100644 benchmarks/rust-baseline/results/2026-06-12-1340-rust-baseline-default-full.json create mode 100644 benchmarks/rust-baseline/results/2026-06-12-1340-rust-baseline-default-medium.json create mode 100644 benchmarks/rust-baseline/results/2026-06-12-1340-rust-baseline-default-smoke.json create mode 100644 benchmarks/rust-baseline/results/2026-06-12-1342-rust-baseline-default-huge.json diff --git a/benchmarks/rust-baseline/README.md b/benchmarks/rust-baseline/README.md index 64bfe2d..cc020b8 100644 --- a/benchmarks/rust-baseline/README.md +++ b/benchmarks/rust-baseline/README.md @@ -30,17 +30,20 @@ theoretical engine ceiling that any binding could approach but never beat. The SQLite path uses `rusqlite` against the same generated workload, with `journal_mode=WAL`, `synchronous=FULL`, and `wal_autocheckpoint=0`. Each seed -phase runs in one explicit `BEGIN IMMEDIATE` transaction, and query timing -materializes every returned column before counting a row. +phase runs in one explicit `BEGIN IMMEDIATE` transaction. After seeding, both +engines run a measured WAL checkpoint before query timing starts: DecentDB uses +`Db::checkpoint_wal()` and SQLite uses `PRAGMA wal_checkpoint(TRUNCATE)`. Query +timing materializes every returned column before counting a row. ## Schema and queries - `artists`, `albums`, `songs` tables with the same columns/PKs. - 5 secondary indexes (`idx_albums_artist`, `idx_songs_album`, etc.). - `v_artist_songs` view joining all three. 
-- 12 instrumented steps: `connect_open`, `schema_create`, three seed loops, - and seven query shapes including `COUNT(*)`, aggregates, by-id lookup, - Top-10 artists/albums by song count, and view scans. +- 13 instrumented steps: `connect_open`, `schema_create`, three seed loops, + `checkpoint_after_seed`, and seven query shapes including `COUNT(*)`, + aggregates, by-id lookup, Top-10 artists/albums by song count, and view + scans. ## Scales @@ -65,6 +68,8 @@ counts are reported as `Plan: artists=… total_albums=… total_songs=…`. ```bash cd /home/steven/src/github/decentdb/benchmarks/rust-baseline cargo build --release +./target/release/rust-baseline --engine decentdb --benchmark +./target/release/rust-baseline --engine sqlite --benchmark ./target/release/rust-baseline --engine decentdb --scale smoke ./target/release/rust-baseline --engine decentdb --scale medium ./target/release/rust-baseline --engine decentdb --scale full @@ -75,6 +80,12 @@ cargo build --release ./target/release/rust-baseline --report --report-file /tmp/rust-baseline-report.html ``` +Use `--benchmark` to run all scales in order (`smoke`, `medium`, `full`, +`huge`) for the selected engine/profile and then generate the same HTML report +as `--report`. Suite mode uses the default per-engine/per-scale database paths +and rejects `--db-path`; use single-scale mode when you need to pin an exact +database file. 
+ To run the full DecentDB-vs-SQLite comparison into a temporary output directory, use: @@ -82,20 +93,13 @@ directory, use: cd /home/steven/src/github/decentdb/benchmarks/rust-baseline cargo build --release OUT="$PWD/../../.tmp/rust-baseline-compare/results" -DBS="$PWD/../../.tmp/rust-baseline-compare/dbs" -mkdir -p "$OUT" "$DBS" -for scale in smoke medium full huge; do - ./target/release/rust-baseline \ - --engine decentdb \ - --scale "$scale" \ - --out-dir "$OUT" \ - --db-path "$DBS/run-decentdb-$scale.ddb" - ./target/release/rust-baseline \ - --engine sqlite \ - --scale "$scale" \ - --out-dir "$OUT" \ - --db-path "$DBS/run-sqlite-$scale.db" -done +mkdir -p "$OUT" +./target/release/rust-baseline --engine decentdb --benchmark --out-dir "$OUT" +./target/release/rust-baseline \ + --engine sqlite \ + --benchmark \ + --out-dir "$OUT" \ + --report-file "$OUT/report.html" ``` ## Profiles @@ -134,16 +138,19 @@ results/ ``` Each JSON report records `binding`, `benchmark_profile`, `engine_version`, -database/WAL size, peak RSS, total runtime, and every instrumented step. Use -`binding` to separate DecentDB (`RustRaw`) from SQLite (`SQLiteRusqlite`) when -comparing runs programmatically. +database/WAL size after the run, peak RSS, total runtime, and every +instrumented step. The `checkpoint_after_seed` step records checkpoint duration +plus WAL/database bytes before and after the checkpoint in its `extra` object. +Use `binding` to separate DecentDB (`RustRaw`) from SQLite (`SQLiteRusqlite`) +when comparing runs programmatically. ### Historical HTML report -`--report` is a **report-only** mode: it does not run a benchmark. Instead it -loads every `*.json` result in `results/`, groups runs by scale (`smoke`, -`medium`, `full`, `huge`), and writes a static HTML report to -`results/report.html` by default. +`--report` is a **report-only** mode when used by itself: it does not run a +benchmark. 
Instead it loads every `*.json` result in `results/`, groups runs by +scale (`smoke`, `medium`, `full`, `huge`), and writes a static HTML report to +`results/report.html` by default. `--benchmark` runs the suite first and then +performs this report generation step automatically. The generated report includes: @@ -154,7 +161,8 @@ The generated report includes: - raw run-history tables and per-step summary tables so regressions and improvements are easy to spot over time -Use `--report-file ` with `--report` to override the output path. +Use `--report-file ` with `--report` or `--benchmark` to override the +output path. ## Engine memory observation (worth filing) diff --git a/benchmarks/rust-baseline/results/2026-06-12-1340-rust-baseline-default-full.json b/benchmarks/rust-baseline/results/2026-06-12-1340-rust-baseline-default-full.json new file mode 100644 index 0000000..45da608 --- /dev/null +++ b/benchmarks/rust-baseline/results/2026-06-12-1340-rust-baseline-default-full.json @@ -0,0 +1,155 @@ +{ + "binding": "RustRaw", + "scale_name": "full", + "benchmark_profile": "default", + "target_artists": 50000, + "target_albums": 500000, + "target_songs_cap": 5000000, + "started_unix": 1781271637, + "finished_unix": 1781271652, + "engine_version": "2.10.0", + "database_path": "run-rust-full.ddb", + "database_size_bytes": 161189888, + "wal_size_bytes": 32, + "peak_rss_bytes": 708538368, + "steps": [ + { + "name": "connect_open", + "duration_seconds": 0.006048563, + "records": null, + "records_per_second": null, + "rss_bytes": 39452672, + "rss_anon_kb": 28612, + "rss_file_kb": 9916, + "extra": {} + }, + { + "name": "schema_create", + "duration_seconds": 0.007789002, + "records": null, + "records_per_second": null, + "rss_bytes": 11993088, + "rss_anon_kb": 1796, + "rss_file_kb": 9916, + "extra": {} + }, + { + "name": "seed_artists", + "duration_seconds": 0.055255581, + "records": 50000, + "records_per_second": 904885.9697991413, + "rss_bytes": 30941184, + "rss_anon_kb": 
20300, + "rss_file_kb": 9916, + "extra": {} + }, + { + "name": "seed_albums", + "duration_seconds": 0.471023433, + "records": 500000, + "records_per_second": 1061518.3130390034, + "rss_bytes": 203309056, + "rss_anon_kb": 188628, + "rss_file_kb": 9916, + "extra": {} + }, + { + "name": "seed_songs", + "duration_seconds": 2.122317449, + "records": 2749816, + "records_per_second": 1295666.6785619967, + "rss_bytes": 708538368, + "rss_anon_kb": 682016, + "rss_file_kb": 9916, + "extra": {} + }, + { + "name": "checkpoint_after_seed", + "duration_seconds": 0.342206754, + "records": null, + "records_per_second": null, + "rss_bytes": 538898432, + "rss_anon_kb": 516352, + "rss_file_kb": 9916, + "extra": { + "checkpoint_mode": "wal", + "database_bytes_after": 161189888, + "database_bytes_before": 8192, + "wal_bytes_after": 32, + "wal_bytes_before": 167772160 + } + }, + { + "name": "query_count_songs", + "duration_seconds": 0.090038021, + "records": null, + "records_per_second": null, + "rss_bytes": 355319808, + "rss_anon_kb": 337076, + "rss_file_kb": 9916, + "extra": { + "count": 2749816 + } + }, + { + "name": "query_aggregate_durations", + "duration_seconds": 0.129008099, + "records": null, + "records_per_second": null, + "rss_bytes": 355364864, + "rss_anon_kb": 337120, + "rss_file_kb": 9916, + "extra": {} + }, + { + "name": "query_artist_by_id", + "duration_seconds": 0.000075441, + "records": null, + "records_per_second": null, + "rss_bytes": 355364864, + "rss_anon_kb": 337120, + "rss_file_kb": 9916, + "extra": {} + }, + { + "name": "query_top10_artists_by_songs", + "duration_seconds": 3.407594322, + "records": null, + "records_per_second": null, + "rss_bytes": 150626304, + "rss_anon_kb": 137180, + "rss_file_kb": 9916, + "extra": {} + }, + { + "name": "query_top10_albums_by_songs", + "duration_seconds": 4.906226985, + "records": null, + "records_per_second": null, + "rss_bytes": 150614016, + "rss_anon_kb": 137168, + "rss_file_kb": 9916, + "extra": {} + }, + { + "name": 
"query_view_first_1000", + "duration_seconds": 2.097430938, + "records": null, + "records_per_second": null, + "rss_bytes": 408055808, + "rss_anon_kb": 388576, + "rss_file_kb": 9916, + "extra": {} + }, + { + "name": "query_songs_for_artist_via_view", + "duration_seconds": 0.868569525, + "records": null, + "records_per_second": null, + "rss_bytes": 407834624, + "rss_anon_kb": 388360, + "rss_file_kb": 9916, + "extra": {} + } + ] +} \ No newline at end of file diff --git a/benchmarks/rust-baseline/results/2026-06-12-1340-rust-baseline-default-medium.json b/benchmarks/rust-baseline/results/2026-06-12-1340-rust-baseline-default-medium.json new file mode 100644 index 0000000..97727d5 --- /dev/null +++ b/benchmarks/rust-baseline/results/2026-06-12-1340-rust-baseline-default-medium.json @@ -0,0 +1,155 @@ +{ + "binding": "RustRaw", + "scale_name": "medium", + "benchmark_profile": "default", + "target_artists": 5000, + "target_albums": 50000, + "target_songs_cap": 500000, + "started_unix": 1781271636, + "finished_unix": 1781271637, + "engine_version": "2.10.0", + "database_path": "run-rust-medium.ddb", + "database_size_bytes": 15314944, + "wal_size_bytes": 32, + "peak_rss_bytes": 92336128, + "steps": [ + { + "name": "connect_open", + "duration_seconds": 0.003039259, + "records": null, + "records_per_second": null, + "rss_bytes": 13885440, + "rss_anon_kb": 3644, + "rss_file_kb": 9916, + "extra": {} + }, + { + "name": "schema_create", + "duration_seconds": 0.004280501, + "records": null, + "records_per_second": null, + "rss_bytes": 11636736, + "rss_anon_kb": 1448, + "rss_file_kb": 9916, + "extra": {} + }, + { + "name": "seed_artists", + "duration_seconds": 0.008510327, + "records": 5000, + "records_per_second": 587521.490067303, + "rss_bytes": 13381632, + "rss_anon_kb": 3152, + "rss_file_kb": 9916, + "extra": {} + }, + { + "name": "seed_albums", + "duration_seconds": 0.048201539, + "records": 50000, + "records_per_second": 1037311.2775507022, + "rss_bytes": 33234944, + 
"rss_anon_kb": 22540, + "rss_file_kb": 9916, + "extra": {} + }, + { + "name": "seed_songs", + "duration_seconds": 0.184605231, + "records": 276243, + "records_per_second": 1496398.5500497546, + "rss_bytes": 92336128, + "rss_anon_kb": 80256, + "rss_file_kb": 9916, + "extra": {} + }, + { + "name": "checkpoint_after_seed", + "duration_seconds": 0.039218324, + "records": null, + "records_per_second": null, + "rss_bytes": 72036352, + "rss_anon_kb": 60432, + "rss_file_kb": 9916, + "extra": { + "checkpoint_mode": "wal", + "database_bytes_after": 15314944, + "database_bytes_before": 8192, + "wal_bytes_after": 32, + "wal_bytes_before": 16777216 + } + }, + { + "name": "query_count_songs", + "duration_seconds": 0.008567685, + "records": null, + "records_per_second": null, + "rss_bytes": 56029184, + "rss_anon_kb": 44800, + "rss_file_kb": 9916, + "extra": { + "count": 276243 + } + }, + { + "name": "query_aggregate_durations", + "duration_seconds": 0.013218261, + "records": null, + "records_per_second": null, + "rss_bytes": 56029184, + "rss_anon_kb": 44800, + "rss_file_kb": 9916, + "extra": {} + }, + { + "name": "query_artist_by_id", + "duration_seconds": 0.000068528, + "records": null, + "records_per_second": null, + "rss_bytes": 56029184, + "rss_anon_kb": 44800, + "rss_file_kb": 9916, + "extra": {} + }, + { + "name": "query_top10_artists_by_songs", + "duration_seconds": 0.348935494, + "records": null, + "records_per_second": null, + "rss_bytes": 25616384, + "rss_anon_kb": 15100, + "rss_file_kb": 9916, + "extra": {} + }, + { + "name": "query_top10_albums_by_songs", + "duration_seconds": 0.504654391, + "records": null, + "records_per_second": null, + "rss_bytes": 25702400, + "rss_anon_kb": 15184, + "rss_file_kb": 9916, + "extra": {} + }, + { + "name": "query_view_first_1000", + "duration_seconds": 0.217226281, + "records": null, + "records_per_second": null, + "rss_bytes": 55074816, + "rss_anon_kb": 43868, + "rss_file_kb": 9916, + "extra": {} + }, + { + "name": 
"query_songs_for_artist_via_view", + "duration_seconds": 0.092963807, + "records": null, + "records_per_second": null, + "rss_bytes": 54841344, + "rss_anon_kb": 43640, + "rss_file_kb": 9916, + "extra": {} + } + ] +} \ No newline at end of file diff --git a/benchmarks/rust-baseline/results/2026-06-12-1340-rust-baseline-default-smoke.json b/benchmarks/rust-baseline/results/2026-06-12-1340-rust-baseline-default-smoke.json new file mode 100644 index 0000000..a296781 --- /dev/null +++ b/benchmarks/rust-baseline/results/2026-06-12-1340-rust-baseline-default-smoke.json @@ -0,0 +1,155 @@ +{ + "binding": "RustRaw", + "scale_name": "smoke", + "benchmark_profile": "default", + "target_artists": 500, + "target_albums": 5000, + "target_songs_cap": 50000, + "started_unix": 1781271636, + "finished_unix": 1781271636, + "engine_version": "2.10.0", + "database_path": "run-rust-smoke.ddb", + "database_size_bytes": 1478656, + "wal_size_bytes": 32, + "peak_rss_bytes": 19906560, + "steps": [ + { + "name": "connect_open", + "duration_seconds": 0.005303462, + "records": null, + "records_per_second": null, + "rss_bytes": 6012928, + "rss_anon_kb": 628, + "rss_file_kb": 5244, + "extra": {} + }, + { + "name": "schema_create", + "duration_seconds": 0.003744304, + "records": null, + "records_per_second": null, + "rss_bytes": 10121216, + "rss_anon_kb": 864, + "rss_file_kb": 9020, + "extra": {} + }, + { + "name": "seed_artists", + "duration_seconds": 0.002763662, + "records": 500, + "records_per_second": 180919.3743663299, + "rss_bytes": 10498048, + "rss_anon_kb": 1168, + "rss_file_kb": 9084, + "extra": {} + }, + { + "name": "seed_albums", + "duration_seconds": 0.007998375, + "records": 5000, + "records_per_second": 625126.9789175927, + "rss_bytes": 12644352, + "rss_anon_kb": 3136, + "rss_file_kb": 9212, + "extra": {} + }, + { + "name": "seed_songs", + "duration_seconds": 0.021034434, + "records": 27783, + "records_per_second": 1320834.2092779868, + "rss_bytes": 19906560, + "rss_anon_kb": 10228, 
+ "rss_file_kb": 9212, + "extra": {} + }, + { + "name": "checkpoint_after_seed", + "duration_seconds": 0.008046286, + "records": null, + "records_per_second": null, + "rss_bytes": 16977920, + "rss_anon_kb": 7368, + "rss_file_kb": 9212, + "extra": { + "checkpoint_mode": "wal", + "database_bytes_after": 1478656, + "database_bytes_before": 8192, + "wal_bytes_after": 32, + "wal_bytes_before": 16777216 + } + }, + { + "name": "query_count_songs", + "duration_seconds": 0.000975332, + "records": null, + "records_per_second": null, + "rss_bytes": 15409152, + "rss_anon_kb": 5708, + "rss_file_kb": 9340, + "extra": { + "count": 27783 + } + }, + { + "name": "query_aggregate_durations", + "duration_seconds": 0.001442269, + "records": null, + "records_per_second": null, + "rss_bytes": 15794176, + "rss_anon_kb": 5764, + "rss_file_kb": 9660, + "extra": {} + }, + { + "name": "query_artist_by_id", + "duration_seconds": 0.000037541, + "records": null, + "records_per_second": null, + "rss_bytes": 15794176, + "rss_anon_kb": 5764, + "rss_file_kb": 9660, + "extra": {} + }, + { + "name": "query_top10_artists_by_songs", + "duration_seconds": 0.034113804, + "records": null, + "records_per_second": null, + "rss_bytes": 12931072, + "rss_anon_kb": 2968, + "rss_file_kb": 9660, + "extra": {} + }, + { + "name": "query_top10_albums_by_songs", + "duration_seconds": 0.049339026, + "records": null, + "records_per_second": null, + "rss_bytes": 12886016, + "rss_anon_kb": 2924, + "rss_file_kb": 9660, + "extra": {} + }, + { + "name": "query_view_first_1000", + "duration_seconds": 0.021542078, + "records": null, + "records_per_second": null, + "rss_bytes": 15343616, + "rss_anon_kb": 5260, + "rss_file_kb": 9724, + "extra": {} + }, + { + "name": "query_songs_for_artist_via_view", + "duration_seconds": 0.008561393, + "records": null, + "records_per_second": null, + "rss_bytes": 15339520, + "rss_anon_kb": 5064, + "rss_file_kb": 9916, + "extra": {} + } + ] +} \ No newline at end of file diff --git 
a/benchmarks/rust-baseline/results/2026-06-12-1342-rust-baseline-default-huge.json b/benchmarks/rust-baseline/results/2026-06-12-1342-rust-baseline-default-huge.json new file mode 100644 index 0000000..4a73beb --- /dev/null +++ b/benchmarks/rust-baseline/results/2026-06-12-1342-rust-baseline-default-huge.json @@ -0,0 +1,155 @@ +{ + "binding": "RustRaw", + "scale_name": "huge", + "benchmark_profile": "default", + "target_artists": 250000, + "target_albums": 2500000, + "target_songs_cap": 25000000, + "started_unix": 1781271652, + "finished_unix": 1781271732, + "engine_version": "2.10.0", + "database_path": "run-rust-huge.ddb", + "database_size_bytes": 833892352, + "wal_size_bytes": 32, + "peak_rss_bytes": 3446149120, + "steps": [ + { + "name": "connect_open", + "duration_seconds": 0.005878543, + "records": null, + "records_per_second": null, + "rss_bytes": 250970112, + "rss_anon_kb": 235172, + "rss_file_kb": 9916, + "extra": {} + }, + { + "name": "schema_create", + "duration_seconds": 0.031754994, + "records": null, + "records_per_second": null, + "rss_bytes": 12103680, + "rss_anon_kb": 1904, + "rss_file_kb": 9916, + "extra": {} + }, + { + "name": "seed_artists", + "duration_seconds": 0.256828467, + "records": 250000, + "records_per_second": 973412.3437336874, + "rss_bytes": 101318656, + "rss_anon_kb": 89028, + "rss_file_kb": 9916, + "extra": {} + }, + { + "name": "seed_albums", + "duration_seconds": 2.517898258, + "records": 2500000, + "records_per_second": 992891.5880762328, + "rss_bytes": 924823552, + "rss_anon_kb": 893232, + "rss_file_kb": 9916, + "extra": {} + }, + { + "name": "seed_songs", + "duration_seconds": 18.041484802, + "records": 13746520, + "records_per_second": 761939.5050276639, + "rss_bytes": 3446149120, + "rss_anon_kb": 3355464, + "rss_file_kb": 9916, + "extra": {} + }, + { + "name": "checkpoint_after_seed", + "duration_seconds": 1.9839196810000002, + "records": null, + "records_per_second": null, + "rss_bytes": 2589343744, + "rss_anon_kb": 
2518740, + "rss_file_kb": 9916, + "extra": { + "checkpoint_mode": "wal", + "database_bytes_after": 833892352, + "database_bytes_before": 8192, + "wal_bytes_after": 32, + "wal_bytes_before": 805306368 + } + }, + { + "name": "query_count_songs", + "duration_seconds": 0.416849104, + "records": null, + "records_per_second": null, + "rss_bytes": 1683533824, + "rss_anon_kb": 1634160, + "rss_file_kb": 9916, + "extra": { + "count": 13746520 + } + }, + { + "name": "query_aggregate_durations", + "duration_seconds": 0.666562944, + "records": null, + "records_per_second": null, + "rss_bytes": 1683550208, + "rss_anon_kb": 1634176, + "rss_file_kb": 9916, + "extra": {} + }, + { + "name": "query_artist_by_id", + "duration_seconds": 0.000087795, + "records": null, + "records_per_second": null, + "rss_bytes": 1683550208, + "rss_anon_kb": 1634176, + "rss_file_kb": 9916, + "extra": {} + }, + { + "name": "query_top10_artists_by_songs", + "duration_seconds": 16.653227525, + "records": null, + "records_per_second": null, + "rss_bytes": 723496960, + "rss_anon_kb": 696624, + "rss_file_kb": 9916, + "extra": {} + }, + { + "name": "query_top10_albums_by_songs", + "duration_seconds": 24.443281897, + "records": null, + "records_per_second": null, + "rss_bytes": 723472384, + "rss_anon_kb": 696600, + "rss_file_kb": 9916, + "extra": {} + }, + { + "name": "query_view_first_1000", + "duration_seconds": 10.722553411, + "records": null, + "records_per_second": null, + "rss_bytes": 2124804096, + "rss_anon_kb": 2065088, + "rss_file_kb": 9916, + "extra": {} + }, + { + "name": "query_songs_for_artist_via_view", + "duration_seconds": 4.521317772, + "records": null, + "records_per_second": null, + "rss_bytes": 2124570624, + "rss_anon_kb": 2064860, + "rss_file_kb": 9916, + "extra": {} + } + ] +} \ No newline at end of file diff --git a/benchmarks/rust-baseline/results/report.html b/benchmarks/rust-baseline/results/report.html index 0a8587f..a7bd0fe 100644 --- a/benchmarks/rust-baseline/results/report.html 
+++ b/benchmarks/rust-baseline/results/report.html @@ -142,14 +142,14 @@

DecentDB rust-baseline analytics report

-

Generated 2026-05-28 03:07:38 UTC from results using 43 historical run(s).

+

Generated 2026-06-12 13:42:13 UTC from results using 47 historical run(s).

- +