From 0445d04c9e40daa323a4269157b7abb361dd2177 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Mon, 8 Jun 2026 09:28:56 -0700 Subject: [PATCH] refactor: replace deepsize crate with custom DeepSizeOf trait for Arrow-aware memory accounting The external `deepsize` crate does not correctly account for Arrow buffers that are shared across `Arc` references, causing double-counting in cache size calculations. This PR introduces a custom `DeepSizeOf` trait in `lance-core::deepsize` with a `Context` that tracks both `Arc` and raw buffer pointers in a unified `HashSet`. It also adds a `lance-derive` proc-macro crate for `#[derive(DeepSizeOf)]` and removes the dependency on the external `deepsize` crate. Co-Authored-By: Claude Sonnet 4.6 --- Cargo.lock | 39 +- Cargo.toml | 3 +- java/lance-jni/Cargo.lock | 138 +++--- python/Cargo.lock | 39 +- rust/lance-core/Cargo.toml | 3 +- rust/lance-core/src/cache/mod.rs | 2 +- rust/lance-core/src/container/list.rs | 4 +- rust/lance-core/src/datatypes.rs | 6 +- rust/lance-core/src/datatypes/field.rs | 2 +- rust/lance-core/src/datatypes/schema.rs | 2 +- rust/lance-core/src/deepsize.rs | 457 ++++++++++++++++++ rust/lance-core/src/lib.rs | 4 + rust/lance-core/src/utils/deletion.rs | 4 +- rust/lance-derive/Cargo.toml | 22 + rust/lance-derive/src/lib.rs | 119 +++++ .../src/encodings/logical/primitive/blob.rs | 6 +- rust/lance-file/Cargo.toml | 1 - .../src/previous/format/metadata.rs | 2 +- rust/lance-file/src/previous/page_table.rs | 2 +- rust/lance-file/src/previous/reader.rs | 2 +- rust/lance-file/src/reader.rs | 4 +- rust/lance-index/Cargo.toml | 1 - rust/lance-index/src/lib.rs | 2 +- rust/lance-index/src/scalar.rs | 2 +- rust/lance-index/src/scalar/bitmap.rs | 6 +- rust/lance-index/src/scalar/bloomfilter.rs | 6 +- rust/lance-index/src/scalar/btree.rs | 10 +- rust/lance-index/src/scalar/btree/flat.rs | 4 +- rust/lance-index/src/scalar/fmindex.rs | 6 +- .../src/scalar/inverted/builder.rs | 4 +- rust/lance-index/src/scalar/inverted/index.rs | 20 +- .../src/scalar/inverted/lazy_docset.rs | 4 +- rust/lance-index/src/scalar/json.rs | 4 +- rust/lance-index/src/scalar/label_list.rs | 4 +- rust/lance-index/src/scalar/lance_format.rs | 4 +- rust/lance-index/src/scalar/ngram.rs | 8 +- rust/lance-index/src/scalar/rtree.rs | 8 +- rust/lance-index/src/scalar/zonemap.rs | 6 +- rust/lance-index/src/vector.rs | 2 +- rust/lance-index/src/vector/bq/builder.rs | 2 +- rust/lance-index/src/vector/bq/storage.rs | 10 +- rust/lance-index/src/vector/flat/index.rs | 2 +- rust/lance-index/src/vector/flat/storage.rs | 6 +- rust/lance-index/src/vector/graph.rs | 2 +- rust/lance-index/src/vector/graph/builder.rs | 2 +- rust/lance-index/src/vector/hnsw.rs | 2 +- rust/lance-index/src/vector/hnsw/builder.rs | 12 +- rust/lance-index/src/vector/hnsw/index.rs | 2 +- rust/lance-index/src/vector/ivf/storage.rs | 6 +- rust/lance-index/src/vector/pq.rs | 4 +- rust/lance-index/src/vector/pq/storage.rs | 18 +- rust/lance-index/src/vector/quantizer.rs | 4 +- rust/lance-index/src/vector/sq.rs | 4 +- rust/lance-index/src/vector/sq/storage.rs | 10 +- rust/lance-index/src/vector/storage.rs | 10 +- rust/lance-index/src/vector/v3/subindex.rs | 2 +- rust/lance-io/Cargo.toml | 1 - rust/lance-io/src/local.rs | 4 +- rust/lance-io/src/object_reader.rs | 6 +- rust/lance-io/src/object_store.rs | 4 +- rust/lance-io/src/scheduler.rs | 4 +- rust/lance-io/src/traits.rs | 2 +- rust/lance-io/src/uring/current_thread.rs | 4 +- rust/lance-io/src/uring/reader.rs | 4 +- rust/lance-io/src/utils.rs | 2 +- rust/lance-linalg/Cargo.toml | 1 - rust/lance-linalg/src/distance.rs | 2 +- rust/lance-linalg/src/distance/l2.rs | 2 +- rust/lance-select/Cargo.toml | 1 - rust/lance-select/src/mask.rs | 6 +- rust/lance-select/src/mask/nullable.rs | 2 +- rust/lance-table/Cargo.toml | 1 - rust/lance-table/benches/manifest_intern.rs | 2 +- rust/lance-table/src/format/fragment.rs | 2 +- rust/lance-table/src/format/index.rs | 4 +- rust/lance-table/src/format/manifest.rs | 4 +- rust/lance-table/src/rowids.rs | 2 +- rust/lance-table/src/rowids/bitmap.rs | 2 +- rust/lance-table/src/rowids/encoded_array.rs | 2 +- rust/lance-table/src/rowids/index.rs | 4 +- rust/lance-table/src/rowids/segment.rs | 4 +- rust/lance-table/src/rowids/version.rs | 2 +- .../src/system_index/frag_reuse.rs | 2 +- rust/lance-table/src/system_index/mem_wal.rs | 6 +- rust/lance/Cargo.toml | 1 - rust/lance/src/dataset.rs | 2 +- rust/lance/src/dataset/transaction.rs | 6 +- .../write/merge_insert/inserted_rows.rs | 2 +- rust/lance/src/index.rs | 2 +- rust/lance/src/index/scalar_logical.rs | 2 +- rust/lance/src/index/vector/fixture_test.rs | 5 +- rust/lance/src/index/vector/ivf.rs | 4 +- rust/lance/src/index/vector/ivf/v2.rs | 12 +- rust/lance/src/index/vector/pq.rs | 20 +- rust/lance/src/io/exec/knn.rs | 2 +- rust/lance/src/session.rs | 4 +- rust/lance/src/session/caches.rs | 2 +- rust/lance/src/session/index_caches.rs | 2 +- rust/lance/src/session/index_extension.rs | 8 +- 99 files changed, 883 insertions(+), 333 deletions(-) create mode 100644 rust/lance-core/src/deepsize.rs create mode 100644 rust/lance-derive/Cargo.toml create mode 100644 rust/lance-derive/src/lib.rs diff --git a/Cargo.lock b/Cargo.lock index 603091c2f5b..6de46011f6e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2702,26 +2702,6 @@ dependencies = [ "uuid", ] -[[package]] -name = "deepsize" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1cdb987ec36f6bf7bfbea3f928b75590b736fc42af8e54d97592481351b2b96c" -dependencies = [ - "deepsize_derive", -] - -[[package]] -name = "deepsize_derive" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "990101d41f3bc8c1a45641024377ee284ecc338e5ecf3ea0f0e236d897c72796" -dependencies = [ - "proc-macro2", - "quote", - "syn 1.0.109", -] - [[package]] name = "der" version = "0.7.10" @@ -4516,7 +4496,6 @@ dependencies = [ "datafusion-physical-expr", "datafusion-physical-plan", "datafusion-substrait", - "deepsize", "either", "env_logger", "fst", @@ -4644,16 +4623,17 @@ version = "8.0.0-beta.7" dependencies = [ "arrow-array", "arrow-buffer", + "arrow-data", "arrow-schema", "async-trait", "byteorder", "bytes", "datafusion-common", "datafusion-sql", - "deepsize", "futures", "itertools 0.13.0", "lance-arrow", + "lance-derive", "libc", "libm", "log", @@ -4730,6 +4710,15 @@ dependencies = [ "random_word", ] +[[package]] +name = "lance-derive" +version = "8.0.0-beta.7" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + [[package]] name = "lance-encoding" version = "8.0.0-beta.7" @@ -4817,7 +4806,6 @@ dependencies = [ "bytes", "criterion", "datafusion-common", - "deepsize", "futures", "lance-arrow", "lance-core", @@ -4880,7 +4868,6 @@ dependencies = [ "datafusion-common", "datafusion-expr", "datafusion-physical-expr", - "deepsize", "dirs", "env_logger", "fst", @@ -4952,7 +4939,6 @@ dependencies = [ "bytes", "chrono", "criterion", - "deepsize", "futures", "http 1.4.2", "io-uring", @@ -4991,7 +4977,6 @@ dependencies = [ "arrow-schema", "cc", "criterion", - "deepsize", "half", "lance-arrow", "lance-core", @@ -5098,7 +5083,6 @@ dependencies = [ "byteorder", "bytes", "criterion", - "deepsize", "itertools 0.13.0", "lance-core", "proptest", @@ -5123,7 +5107,6 @@ dependencies = [ "bytes", "chrono", "criterion", - "deepsize", "futures", "lance-arrow", "lance-core", diff --git a/Cargo.toml b/Cargo.toml index e5e834020cb..819ff691255 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,6 +17,7 @@ members = [ "rust/lance-select", "rust/lance-tokenizer", "rust/lance-table", + "rust/lance-derive", "rust/lance-test-macros", "rust/lance-testing", "rust/lance-tools", @@ -61,6 +62,7 @@ lance-arrow = { version = "=8.0.0-beta.7", path = "./rust/lance-arrow" } lance-core = { version = "=8.0.0-beta.7", path = "./rust/lance-core" } lance-datafusion = { version = "=8.0.0-beta.7", path = "./rust/lance-datafusion" } lance-datagen = { version = "=8.0.0-beta.7", path = "./rust/lance-datagen" } +lance-derive = { version = "=8.0.0-beta.7", path = "./rust/lance-derive" } lance-encoding = { version = "=8.0.0-beta.7", path = "./rust/lance-encoding" } lance-file = { version = "=8.0.0-beta.7", path = "./rust/lance-file" } lance-geo = { version = "=8.0.0-beta.7", path = "./rust/lance-geo" } @@ -138,7 +140,6 @@ datafusion-ffi = "53.0.0" datafusion-physical-expr = "53.0.0" datafusion-physical-plan = "53.0.0" datafusion-substrait = "53.0.0" -deepsize = "0.2.0" dirs = "6.0.0" either = "1.0" fst = { version = "0.4.7", features = ["levenshtein"] } diff --git a/java/lance-jni/Cargo.lock b/java/lance-jni/Cargo.lock index 8860080ca44..954c4ce415b 100644 --- a/java/lance-jni/Cargo.lock +++ b/java/lance-jni/Cargo.lock @@ -426,7 +426,7 @@ checksum = "3b43422f69d8ff38f95f1b2bb76517c91589a924d1559a0e935d7c8ce0274c11" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn", ] [[package]] @@ -437,7 +437,7 @@ checksum = "9035ad2d096bed7955a320ee7e2230574d28fd3c3a0f186cbea1ff3c7eed5dbb" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn", ] [[package]] @@ -1200,7 +1200,7 @@ dependencies = [ "heck", "proc-macro2", "quote", - "syn 2.0.117", + "syn", ] [[package]] @@ -1563,7 +1563,7 @@ dependencies = [ "proc-macro2", "quote", "strsim", - "syn 2.0.117", + "syn", ] [[package]] @@ -1574,7 +1574,7 @@ checksum = "ac3984ec7bd6cfa798e62b4a642426a5be0e68f9401cfc2a01e3fa9ea2fcdb8d" dependencies = [ "darling_core", "quote", - "syn 2.0.117", + "syn", ] [[package]] @@ -2063,7 +2063,7 @@ checksum = "2e367e6a71051d0ebdd29b2f85d12059b38b1d1f172c6906e80016da662226bd" dependencies = [ "datafusion-doc", "quote", - "syn 2.0.117", + "syn", ] [[package]] @@ -2259,26 +2259,6 @@ dependencies = [ "url", ] -[[package]] -name = "deepsize" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1cdb987ec36f6bf7bfbea3f928b75590b736fc42af8e54d97592481351b2b96c" -dependencies = [ - "deepsize_derive", -] - -[[package]] -name = "deepsize_derive" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "990101d41f3bc8c1a45641024377ee284ecc338e5ecf3ea0f0e236d897c72796" -dependencies = [ - "proc-macro2", - "quote", - "syn 1.0.109", -] - [[package]] name = "der" version = "0.7.10" @@ -2353,7 +2333,7 @@ checksum = "1ac70aa55017e108007fbaf5aa0f54b021c98f92ff8af59d42eda9da96e3dd4f" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn", ] [[package]] @@ -2646,7 +2626,7 @@ checksum = "e835b70203e41293343137df5c0664546da5745f82ec9b84d40be8336958447b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn", ] [[package]] @@ -2894,7 +2874,7 @@ checksum = "53010ccb100b96a67bc32c0175f0ed1426b31b655d562898e57325f81c023ac0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn", ] [[package]] @@ -3617,7 +3597,7 @@ checksum = "782d32378dddf207193ac91cefb848ad41abb58195c95168e1291227a0832b47" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn", ] [[package]] @@ -3678,7 +3658,7 @@ dependencies = [ "quote", "rustc_version", "simd_cesu8", - "syn 2.0.117", + "syn", ] [[package]] @@ -3706,7 +3686,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "38c0b942f458fe50cdac086d2f946512305e5631e720728f2a61aabcd47a6264" dependencies = [ "quote", - "syn 2.0.117", + "syn", ] [[package]] @@ -3799,7 +3779,6 @@ dependencies = [ "datafusion-functions", "datafusion-physical-expr", "datafusion-physical-plan", - "deepsize", "either", "fst", "futures", @@ -3899,16 +3878,17 @@ version = "8.0.0-beta.7" dependencies = [ "arrow-array", "arrow-buffer", + "arrow-data", "arrow-schema", "async-trait", "byteorder", "bytes", "datafusion-common", "datafusion-sql", - "deepsize", "futures", "itertools 0.13.0", "lance-arrow", + "lance-derive", "libc", "libm", "log", @@ -3980,6 +3960,15 @@ dependencies = [ "random_word", ] +[[package]] +name = "lance-derive" +version = "8.0.0-beta.7" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "lance-encoding" version = "8.0.0-beta.7" @@ -4030,7 +4019,6 @@ dependencies = [ "byteorder", "bytes", "datafusion-common", - "deepsize", "futures", "lance-arrow", "lance-core", @@ -4083,7 +4071,6 @@ dependencies = [ "datafusion-common", "datafusion-expr", "datafusion-physical-expr", - "deepsize", "dirs", "fst", "futures", @@ -4147,7 +4134,6 @@ dependencies = [ "byteorder", "bytes", "chrono", - "deepsize", "futures", "http 1.4.2", "io-uring", @@ -4214,7 +4200,6 @@ dependencies = [ "arrow-buffer", "arrow-schema", "cc", - "deepsize", "half", "lance-arrow", "lance-core", @@ -4287,7 +4272,6 @@ dependencies = [ "arrow-schema", "byteorder", "bytes", - "deepsize", "itertools 0.13.0", "lance-core", "roaring", @@ -4307,7 +4291,6 @@ dependencies = [ "byteorder", "bytes", "chrono", - "deepsize", "futures", "lance-arrow", "lance-core", @@ -4804,7 +4787,7 @@ dependencies = [ "proc-macro-crate", "proc-macro2", "quote", - "syn 2.0.117", + "syn", ] [[package]] @@ -5442,7 +5425,7 @@ checksum = "c96395f0a926bc13b1c17622aaddda1ecb55d49c8f1bf9777e4d877800a43f8b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn", ] [[package]] @@ -5549,7 +5532,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b" dependencies = [ "proc-macro2", - "syn 2.0.117", + "syn", ] [[package]] @@ -5595,7 +5578,7 @@ dependencies = [ "prost", "prost-types", "regex", - "syn 2.0.117", + "syn", "tempfile", ] @@ -5609,7 +5592,7 @@ dependencies = [ "itertools 0.14.0", "proc-macro2", "quote", - "syn 2.0.117", + "syn", ] [[package]] @@ -5909,7 +5892,7 @@ checksum = "b7186006dcb21920990093f30e3dea63b7d6e977bf1256be20c3563a5db070da" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn", ] [[package]] @@ -6475,7 +6458,7 @@ dependencies = [ "proc-macro2", "quote", "serde_derive_internals", - "syn 2.0.117", + "syn", ] [[package]] @@ -6567,7 +6550,7 @@ checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn", ] [[package]] @@ -6578,7 +6561,7 @@ checksum = "18d26a20a969b9e3fdf2fc2d9f21eda6c40e2de84c9408bb5d3b05d499aae711" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn", ] [[package]] @@ -6613,7 +6596,7 @@ checksum = "175ee3e80ae9982737ca543e96133087cbd9a485eecc3bc4de9c1a37b47ea59c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn", ] [[package]] @@ -6625,7 +6608,7 @@ dependencies = [ "proc-macro2", "quote", "serde", - "syn 2.0.117", + "syn", ] [[package]] @@ -6669,7 +6652,7 @@ dependencies = [ "darling", "proc-macro2", "quote", - "syn 2.0.117", + "syn", ] [[package]] @@ -6832,7 +6815,7 @@ dependencies = [ "heck", "proc-macro2", "quote", - "syn 2.0.117", + "syn", ] [[package]] @@ -6897,7 +6880,7 @@ checksum = "a6dd45d8fc1c79299bfbb7190e42ccbbdf6a5f52e4a6ad98d92357ea965bd289" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn", ] [[package]] @@ -6959,7 +6942,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.117", + "syn", ] [[package]] @@ -6982,7 +6965,7 @@ dependencies = [ "serde", "serde_json", "serde_yaml", - "syn 2.0.117", + "syn", "typify", "walkdir", ] @@ -6999,17 +6982,6 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a7973cce6668464ea31f176d85b13c7ab3bba2cb3b77a2ed26abd7801688010a" -[[package]] -name = "syn" -version = "1.0.109" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" -dependencies = [ - "proc-macro2", - "quote", - "unicode-ident", -] - [[package]] name = "syn" version = "2.0.117" @@ -7038,7 +7010,7 @@ checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn", ] [[package]] @@ -7127,7 +7099,7 @@ checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn", ] [[package]] @@ -7138,7 +7110,7 @@ checksum = "ebc4ee7f67670e9b64d05fa4253e753e016c6c95ff35b89b7941d6b856dec1d5" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn", ] [[package]] @@ -7261,7 +7233,7 @@ checksum = "385a6cb71ab9ab790c5fe8d67f1645e6c450a7ce006a33de03daa956cf70a496" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn", ] [[package]] @@ -7483,7 +7455,7 @@ checksum = "7490cfa5ec963746568740651ac6781f701c9c5ea257c58e057f3ba8cf69e8da" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn", ] [[package]] @@ -7590,7 +7562,7 @@ dependencies = [ "semver", "serde", "serde_json", - "syn 2.0.117", + "syn", "thiserror 2.0.18", "unicode-ident", ] @@ -7608,7 +7580,7 @@ dependencies = [ "serde", "serde_json", "serde_tokenstream", - "syn 2.0.117", + "syn", "typify-impl", ] @@ -7832,7 +7804,7 @@ dependencies = [ "bumpalo", "proc-macro2", "quote", - "syn 2.0.117", + "syn", "wasm-bindgen-shared", ] @@ -8040,7 +8012,7 @@ checksum = "053e2e040ab57b9dc951b72c264860db7eb3b0200ba345b4e4c3b14f67855ddf" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn", ] [[package]] @@ -8051,7 +8023,7 @@ checksum = "3f316c4a2570ba26bbec722032c4099d8c8bc095efccdc15688708623367e358" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn", ] [[package]] @@ -8375,7 +8347,7 @@ dependencies = [ "heck", "indexmap 2.14.0", "prettyplease", - "syn 2.0.117", + "syn", "wasm-metadata", "wit-bindgen-core", "wit-component", @@ -8391,7 +8363,7 @@ dependencies = [ "prettyplease", "proc-macro2", "quote", - "syn 2.0.117", + "syn", "wit-bindgen-core", "wit-bindgen-rust", ] @@ -8651,7 +8623,7 @@ checksum = "de844c262c8848816172cef550288e7dc6c7b7814b4ee56b3e1553f275f1858e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn", "synstructure", ] @@ -8672,7 +8644,7 @@ checksum = "0b631b19d36a892ab55420c92dbc83ccd79274f25be714855d3074aa71cab639" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn", ] [[package]] @@ -8692,7 +8664,7 @@ checksum = "11532158c46691caf0f2593ea8358fed6bbf68a0315e80aae9bd41fbade684a1" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn", "synstructure", ] @@ -8734,7 +8706,7 @@ checksum = "625dc425cab0dca6dc3c3319506e6593dcb08a9f387ea3b284dbd52a92c40555" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn", ] [[package]] diff --git a/python/Cargo.lock b/python/Cargo.lock index e2df3d18af5..dd002bb9430 100644 --- a/python/Cargo.lock +++ b/python/Cargo.lock @@ -2569,26 +2569,6 @@ dependencies = [ "url", ] -[[package]] -name = "deepsize" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1cdb987ec36f6bf7bfbea3f928b75590b736fc42af8e54d97592481351b2b96c" -dependencies = [ - "deepsize_derive", -] - -[[package]] -name = "deepsize_derive" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "990101d41f3bc8c1a45641024377ee284ecc338e5ecf3ea0f0e236d897c72796" -dependencies = [ - "proc-macro2", - "quote", - "syn 1.0.109", -] - [[package]] name = "der" version = "0.7.10" @@ -4166,7 +4146,6 @@ dependencies = [ "datafusion-functions", "datafusion-physical-expr", "datafusion-physical-plan", - "deepsize", "either", "fst", "futures", @@ -4266,16 +4245,17 @@ version = "8.0.0-beta.7" dependencies = [ "arrow-array", "arrow-buffer", + "arrow-data", "arrow-schema", "async-trait", "byteorder", "bytes", "datafusion-common", "datafusion-sql", - "deepsize", "futures", "itertools 0.13.0", "lance-arrow", + "lance-derive", "libc", "libm", "log", @@ -4347,6 +4327,15 @@ dependencies = [ "random_word", ] +[[package]] +name = "lance-derive" +version = "8.0.0-beta.7" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + [[package]] name = "lance-encoding" version = "8.0.0-beta.7" @@ -4397,7 +4386,6 @@ dependencies = [ "byteorder", "bytes", "datafusion-common", - "deepsize", "futures", "lance-arrow", "lance-core", @@ -4450,7 +4438,6 @@ dependencies = [ "datafusion-common", "datafusion-expr", "datafusion-physical-expr", - "deepsize", "dirs", "fst", "futures", @@ -4515,7 +4502,6 @@ dependencies = [ "byteorder", "bytes", "chrono", - "deepsize", "futures", "http 1.4.2", "io-uring", @@ -4546,7 +4532,6 @@ dependencies = [ "arrow-buffer", "arrow-schema", "cc", - "deepsize", "half", "lance-arrow", "lance-core", @@ -4619,7 +4604,6 @@ dependencies = [ "arrow-schema", "byteorder", "bytes", - "deepsize", "itertools 0.13.0", "lance-core", "roaring", @@ -4641,7 +4625,6 @@ dependencies = [ "byteorder", "bytes", "chrono", - "deepsize", "futures", "lance-arrow", "lance-core", diff --git a/rust/lance-core/Cargo.toml b/rust/lance-core/Cargo.toml index 9fb77e376bc..7f956c70430 100644 --- a/rust/lance-core/Cargo.toml +++ b/rust/lance-core/Cargo.toml @@ -14,6 +14,7 @@ rust-version.workspace = true [dependencies] arrow-array.workspace = true arrow-buffer.workspace = true +arrow-data.workspace = true arrow-schema.workspace = true async-trait.workspace = true lance-arrow.workspace = true @@ -21,7 +22,7 @@ byteorder.workspace = true bytes.workspace = true datafusion-common = { workspace = true, optional = true } datafusion-sql = { workspace = true, optional = true } -deepsize.workspace = true +lance-derive.workspace = true futures.workspace = true itertools.workspace = true libc.workspace = true diff --git a/rust/lance-core/src/cache/mod.rs b/rust/lance-core/src/cache/mod.rs index ee6a728ef73..f62837fe3cc 100644 --- a/rust/lance-core/src/cache/mod.rs +++ b/rust/lance-core/src/cache/mod.rs @@ -63,7 +63,7 @@ use futures::{Future, FutureExt}; use crate::Result; -pub use deepsize::{Context, DeepSizeOf}; +pub use crate::deepsize::{Context, DeepSizeOf}; // --------------------------------------------------------------------------- // CacheKey / UnsizedCacheKey — typed key traits for cache users diff --git a/rust/lance-core/src/container/list.rs b/rust/lance-core/src/container/list.rs index 4f1593f4de1..9d8205cb398 100644 --- a/rust/lance-core/src/container/list.rs +++ b/rust/lance-core/src/container/list.rs @@ -3,7 +3,7 @@ use std::collections::LinkedList; -use deepsize::DeepSizeOf; +use crate::deepsize::DeepSizeOf; /// A linked list that grows exponentially. It is used to store a large number of /// elements in a memory-efficient way. The list grows by doubling the capacity of @@ -134,7 +134,7 @@ impl ExpLinkedList { } impl DeepSizeOf for ExpLinkedList { - fn deep_size_of_children(&self, context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, context: &mut crate::deepsize::Context) -> usize { self.inner .iter() .map(|v| v.deep_size_of_children(context)) diff --git a/rust/lance-core/src/datatypes.rs b/rust/lance-core/src/datatypes.rs index 026e6b0bbe9..628f9cf9a90 100644 --- a/rust/lance-core/src/datatypes.rs +++ b/rust/lance-core/src/datatypes.rs @@ -7,9 +7,9 @@ use std::collections::HashMap; use std::fmt::{self, Debug, Formatter}; use std::sync::{Arc, LazyLock}; +use crate::deepsize::DeepSizeOf; use arrow_array::ArrayRef; use arrow_schema::{DataType, Field as ArrowField, Fields, TimeUnit}; -use deepsize::DeepSizeOf; use lance_arrow::bfloat16::{BFLOAT16_EXT_NAME, is_bfloat16_field}; use lance_arrow::{ARROW_EXT_META_KEY, ARROW_EXT_NAME_KEY}; @@ -408,10 +408,10 @@ pub struct Dictionary { } impl DeepSizeOf for Dictionary { - fn deep_size_of_children(&self, _context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, context: &mut crate::deepsize::Context) -> usize { self.values .as_ref() - .map(|v| v.get_array_memory_size()) + .map(|v| (v.as_ref() as &dyn arrow_array::Array).deep_size_of_children(context)) .unwrap_or(0) } } diff --git a/rust/lance-core/src/datatypes/field.rs b/rust/lance-core/src/datatypes/field.rs index b122ce64ac4..4c2665a3640 100644 --- a/rust/lance-core/src/datatypes/field.rs +++ b/rust/lance-core/src/datatypes/field.rs @@ -10,6 +10,7 @@ use std::{ sync::Arc, }; +use crate::deepsize::DeepSizeOf; use arrow_array::{ ArrayRef, cast::AsArray, @@ -18,7 +19,6 @@ use arrow_array::{ }, }; use arrow_schema::{DataType, Field as ArrowField}; -use deepsize::DeepSizeOf; use lance_arrow::{ ARROW_EXT_NAME_KEY, BLOB_META_KEY, BLOB_V2_EXT_NAME, DataTypeExt, json::{is_arrow_json_field, is_json_field}, diff --git a/rust/lance-core/src/datatypes/schema.rs b/rust/lance-core/src/datatypes/schema.rs index 9502f1e45a8..f959c37672f 100644 --- a/rust/lance-core/src/datatypes/schema.rs +++ b/rust/lance-core/src/datatypes/schema.rs @@ -9,9 +9,9 @@ use std::{ sync::Arc, }; +use crate::deepsize::DeepSizeOf; use arrow_array::RecordBatch; use arrow_schema::{Field as ArrowField, Schema as ArrowSchema}; -use deepsize::DeepSizeOf; use lance_arrow::*; use super::field::{Field, OnTypeMismatch, SchemaCompareOptions}; diff --git a/rust/lance-core/src/deepsize.rs b/rust/lance-core/src/deepsize.rs new file mode 100644 index 00000000000..b6c145bb504 --- /dev/null +++ b/rust/lance-core/src/deepsize.rs @@ -0,0 +1,457 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +pub use lance_derive::DeepSizeOf; + +use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet}; +use std::mem::{size_of, size_of_val}; +use std::sync::atomic::{AtomicU64, AtomicUsize}; +use std::sync::{Arc, Mutex, RwLock}; + +use arrow_array::{Array, RecordBatch}; +use arrow_buffer::ArrowNativeType; +use arrow_data::ArrayData; + +pub struct Context { + seen: HashSet, +} + +impl Default for Context { + fn default() -> Self { + Self::new() + } +} + +impl Context { + pub fn new() -> Self { + Self { + seen: HashSet::new(), + } + } + + /// Returns true if this pointer was NOT previously seen (i.e., it's new). + pub fn mark_seen(&mut self, ptr: usize) -> bool { + self.seen.insert(ptr) + } +} + +pub trait DeepSizeOf { + fn deep_size_of(&self) -> usize { + size_of_val(self) + self.deep_size_of_children(&mut Context::new()) + } + + fn deep_size_of_children(&self, context: &mut Context) -> usize; +} + +// Primitives — no heap children +macro_rules! impl_deep_size_primitive { + ($($t:ty),*) => { + $( + impl DeepSizeOf for $t { + fn deep_size_of_children(&self, _context: &mut Context) -> usize { + 0 + } + } + )* + }; +} + +impl_deep_size_primitive!( + u8, + u16, + u32, + u64, + u128, + usize, + i8, + i16, + i32, + i64, + i128, + isize, + f32, + f64, + bool, + () +); + +impl DeepSizeOf for str { + fn deep_size_of_children(&self, _context: &mut Context) -> usize { + 0 + } +} + +impl DeepSizeOf for String { + fn deep_size_of_children(&self, _context: &mut Context) -> usize { + self.capacity() + } +} + +impl DeepSizeOf for AtomicU64 { + fn deep_size_of_children(&self, _context: &mut Context) -> usize { + 0 + } +} + +impl DeepSizeOf for AtomicUsize { + fn deep_size_of_children(&self, _context: &mut Context) -> usize { + 0 + } +} + +impl DeepSizeOf for [T; N] { + fn deep_size_of_children(&self, context: &mut Context) -> usize { + self.iter() + .map(|item| item.deep_size_of_children(context)) + .sum() + } +} + +impl DeepSizeOf for [T] { + fn deep_size_of_children(&self, context: &mut Context) -> usize { + // The slice's own element bytes are accounted for by the owner (e.g. the + // `size_of_val` in the `Arc`/`Box` impls); here we only sum the heap + // children of each element. + self.iter() + .map(|item| item.deep_size_of_children(context)) + .sum() + } +} + +impl DeepSizeOf for RwLock { + fn deep_size_of_children(&self, context: &mut Context) -> usize { + self.read() + .map(|val| val.deep_size_of_children(context)) + .unwrap_or(0) + } +} + +impl DeepSizeOf for Mutex { + fn deep_size_of_children(&self, context: &mut Context) -> usize { + self.lock() + .map(|val| val.deep_size_of_children(context)) + .unwrap_or(0) + } +} + +// Tuples +macro_rules! impl_deep_size_tuple { + ($($name:ident),+) => { + impl<$($name: DeepSizeOf),+> DeepSizeOf for ($($name,)+) { + #[allow(non_snake_case)] + fn deep_size_of_children(&self, context: &mut Context) -> usize { + let ($($name,)+) = self; + 0 $(+ $name.deep_size_of_children(context))+ + } + } + }; +} + +impl_deep_size_tuple!(A, B); +impl_deep_size_tuple!(A, B, C); +impl_deep_size_tuple!(A, B, C, D); +impl_deep_size_tuple!(A, B, C, D, E); +impl_deep_size_tuple!(A, B, C, D, E, F); + +impl DeepSizeOf for Vec { + fn deep_size_of_children(&self, context: &mut Context) -> usize { + self.capacity() * size_of::() + + self + .iter() + .map(|item| item.deep_size_of_children(context)) + .sum::() + } +} + +impl DeepSizeOf for Box { + fn deep_size_of_children(&self, context: &mut Context) -> usize { + size_of_val(&**self) + (**self).deep_size_of_children(context) + } +} + +impl DeepSizeOf for Arc { + fn deep_size_of_children(&self, context: &mut Context) -> usize { + if context.mark_seen(Self::as_ptr(self) as *const () as usize) { + size_of_val(&**self) + (**self).deep_size_of_children(context) + } else { + 0 + } + } +} + +impl DeepSizeOf for Option { + fn deep_size_of_children(&self, context: &mut Context) -> usize { + match self { + Some(val) => val.deep_size_of_children(context), + None => 0, + } + } +} + +impl DeepSizeOf for HashMap { + fn deep_size_of_children(&self, context: &mut Context) -> usize { + // Each bucket holds a key-value pair plus hash metadata (~1 byte control per bucket). + // Robin hood / Swiss table capacity is always a power of 2. + let capacity_bytes = self.capacity() * (size_of::() + size_of::() + 1); + let children: usize = self + .iter() + .map(|(k, v)| k.deep_size_of_children(context) + v.deep_size_of_children(context)) + .sum(); + capacity_bytes + children + } +} + +impl DeepSizeOf for HashSet { + fn deep_size_of_children(&self, context: &mut Context) -> usize { + let capacity_bytes = self.capacity() * (size_of::() + 1); + let children: usize = self.iter().map(|k| k.deep_size_of_children(context)).sum(); + capacity_bytes + children + } +} + +impl DeepSizeOf for BTreeMap { + fn deep_size_of_children(&self, context: &mut Context) -> usize { + // BTreeMap nodes have ~11 entries each. Rough estimate: per-entry overhead ~3 pointers. + let per_entry = size_of::() + size_of::() + 3 * size_of::(); + let overhead = self.len() * per_entry; + let children: usize = self + .iter() + .map(|(k, v)| k.deep_size_of_children(context) + v.deep_size_of_children(context)) + .sum(); + overhead + children + } +} + +impl DeepSizeOf for BTreeSet { + fn deep_size_of_children(&self, context: &mut Context) -> usize { + let per_entry = size_of::() + 3 * size_of::(); + let overhead = self.len() * per_entry; + let children: usize = self.iter().map(|k| k.deep_size_of_children(context)).sum(); + overhead + children + } +} + +// Arrow types + +fn record_array_data(context: &mut Context, data: &ArrayData) -> usize { + let mut total = 0; + for buffer in data.buffers() { + if context.mark_seen(buffer.as_ptr() as usize) { + total += buffer.capacity(); + } + } + if let Some(nulls) = data.nulls() { + let null_buf = nulls.inner().inner(); + if context.mark_seen(null_buf.as_ptr() as usize) { + total += null_buf.capacity(); + } + } + for child in data.child_data() { + total += record_array_data(context, child); + } + total +} + +impl DeepSizeOf for dyn Array { + fn deep_size_of_children(&self, context: &mut Context) -> usize { + // `to_data()` only clones Arc refs (no data copy) and allocates a small + // ArrayData metadata struct. This lets us walk buffer pointers for dedup. + // Cost is O(number_of_buffers), not O(data_size). + let data = self.to_data(); + record_array_data(context, &data) + } +} + +impl DeepSizeOf for RecordBatch { + fn deep_size_of_children(&self, context: &mut Context) -> usize { + self.columns() + .iter() + .map(|col| col.deep_size_of_children(context)) + .sum() + } +} + +impl DeepSizeOf for arrow_buffer::ScalarBuffer +where + T: ArrowNativeType, +{ + fn deep_size_of_children(&self, context: &mut Context) -> usize { + // Track the underlying buffer pointer to avoid double-counting shared allocations. + // Use capacity() rather than len() * size_of::() because sliced buffers retain + // their full original allocation. + let buf = self.inner(); + if context.mark_seen(buf.as_ptr() as usize) { + buf.capacity() + } else { + 0 + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use arrow_array::{Int32Array, StringArray, StructArray}; + use arrow_schema::{DataType, Field, Fields, Schema}; + + #[test] + fn test_basic_record_batch() { + let batch = RecordBatch::try_new( + Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)])), + vec![Arc::new(Int32Array::from(vec![1, 2, 3]))], + ) + .unwrap(); + + let size = batch.deep_size_of(); + // Should at least include the buffer for 3 i32s + assert!(size >= 3 * size_of::()); + } + + #[test] + fn test_same_batch_dedup() { + let batch = RecordBatch::try_new( + Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)])), + vec![Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5]))], + ) + .unwrap(); + + let mut ctx = Context::new(); + let size_a = batch.deep_size_of_children(&mut ctx); + let size_b = batch.deep_size_of_children(&mut ctx); + + // First measurement should report buffer sizes + assert!(size_a > 0); + // Second measurement of the same batch should add nothing (buffers already seen) + assert_eq!(size_b, 0); + } + + #[test] + fn test_arc_dedup() { + let batch = Arc::new( + RecordBatch::try_new( + Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)])), + vec![Arc::new(Int32Array::from(vec![1, 2, 3]))], + ) + .unwrap(), + ); + let clone = Arc::clone(&batch); + + let mut ctx = Context::new(); + let size_a = batch.deep_size_of_children(&mut ctx); + let size_b = clone.deep_size_of_children(&mut ctx); + + assert!(size_a > 0); + assert_eq!(size_b, 0); + } + + #[test] + fn test_multi_column_shared_array() { + // Two columns pointing to the same Arc + let array: Arc = Arc::new(Int32Array::from(vec![10, 20, 30])); + let schema = Arc::new(Schema::new(vec![ + Field::new("a", DataType::Int32, false), + Field::new("b", DataType::Int32, false), + ])); + + // Single-column batch for reference + let one_col = RecordBatch::try_new( + Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)])), + vec![array.clone()], + ) + .unwrap(); + + // Two-column batch with the same Arc shared + let two_col = RecordBatch::try_new(schema, vec![array.clone(), array]).unwrap(); + + let mut ctx1 = Context::new(); + let size_one = one_col.deep_size_of_children(&mut ctx1); + + let mut ctx2 = Context::new(); + let size_two = two_col.deep_size_of_children(&mut ctx2); + + // Both should report the same size since the second column's Arc is + // already seen and contributes nothing + assert_eq!(size_one, size_two); + } + + #[test] + fn test_nested_struct_array() { + let int_array = Int32Array::from(vec![1, 2, 3]); + let str_array = StringArray::from(vec!["a", "b", "c"]); + let struct_array = StructArray::from(vec![ + ( + Arc::new(Field::new("x", DataType::Int32, false)), + Arc::new(int_array) as Arc, + ), + ( + Arc::new(Field::new("y", DataType::Utf8, false)), + Arc::new(str_array) as Arc, + ), + ]); + + let batch = RecordBatch::try_new( + Arc::new(Schema::new(vec![Field::new( + "s", + DataType::Struct(Fields::from(vec![ + Field::new("x", DataType::Int32, false), + Field::new("y", DataType::Utf8, false), + ])), + false, + )])), + vec![Arc::new(struct_array)], + ) + .unwrap(); + + let size = batch.deep_size_of(); + // Should include buffers for both child arrays + assert!(size > 3 * size_of::()); + } + + #[test] + fn test_std_types() { + assert_eq!(42u32.deep_size_of(), size_of::()); + + let s = String::from("hello"); + assert!(s.deep_size_of() >= size_of::() + 5); + + let v = vec![1u32, 2, 3]; + assert!(v.deep_size_of() >= size_of::>() + 3 * size_of::()); + + let a = Arc::new(42u32); + let b = Arc::clone(&a); + let mut ctx = Context::new(); + let size_a = a.deep_size_of_children(&mut ctx); + let size_b = b.deep_size_of_children(&mut ctx); + assert_eq!(size_a, size_of::()); + assert_eq!(size_b, 0); + } + + #[test] + fn test_derive_macro() { + use lance_derive::DeepSizeOf; + + #[derive(DeepSizeOf)] + struct Outer { + count: u64, + label: String, + inner: Inner, + } + + #[derive(DeepSizeOf)] + struct Inner { + values: Vec, + } + + let val = Outer { + count: 7, + label: String::from("hello"), + inner: Inner { + values: vec![1, 2, 3], + }, + }; + + let size = val.deep_size_of(); + // Must be at least the stack size + heap allocations for label + values + assert!(size >= std::mem::size_of::() + 5 + 3 * std::mem::size_of::()); + } +} diff --git a/rust/lance-core/src/lib.rs b/rust/lance-core/src/lib.rs index 173c7d0ceaa..8379fa74c4d 100644 --- a/rust/lance-core/src/lib.rs +++ b/rust/lance-core/src/lib.rs @@ -2,12 +2,16 @@ // SPDX-FileCopyrightText: Copyright The Lance Authors #![cfg_attr(coverage, feature(coverage_attribute))] +// Allow the derive macro to reference `lance_core::deepsize` from within this crate. +extern crate self as lance_core; + use arrow_schema::{DataType, Field as ArrowField}; use std::sync::LazyLock; pub mod cache; pub mod container; pub mod datatypes; +pub mod deepsize; pub mod error; pub mod levenshtein; pub mod traits; diff --git a/rust/lance-core/src/utils/deletion.rs b/rust/lance-core/src/utils/deletion.rs index 5ddfc3348e5..c7f8b142464 100644 --- a/rust/lance-core/src/utils/deletion.rs +++ b/rust/lance-core/src/utils/deletion.rs @@ -3,8 +3,8 @@ use std::{collections::HashSet, ops::Range, sync::Arc}; +use crate::deepsize::{Context, DeepSizeOf}; use arrow_array::BooleanArray; -use deepsize::{Context, DeepSizeOf}; use roaring::RoaringBitmap; /// Threshold for when a DeletionVector::Set should be promoted to a DeletionVector::Bitmap. @@ -296,7 +296,7 @@ impl From for DeletionVector { #[cfg_attr(coverage, coverage(off))] mod test { use super::*; - use deepsize::DeepSizeOf; + use crate::deepsize::DeepSizeOf; use rstest::rstest; fn set_dv(vals: impl IntoIterator) -> DeletionVector { diff --git a/rust/lance-derive/Cargo.toml b/rust/lance-derive/Cargo.toml new file mode 100644 index 00000000000..4bb99d3ac93 --- /dev/null +++ b/rust/lance-derive/Cargo.toml @@ -0,0 +1,22 @@ +[package] +name = "lance-derive" +version.workspace = true +edition.workspace = true +authors.workspace = true +license.workspace = true +repository.workspace = true +readme.workspace = true +description = "Derive macros for Lance" +keywords.workspace = true +categories.workspace = true + +[lib] +proc-macro = true + +[dependencies] +proc-macro2 = "1.0.67" +quote = "1.0.33" +syn = { version = "2.0.37", features = ["full"] } + +[lints] +workspace = true diff --git a/rust/lance-derive/src/lib.rs b/rust/lance-derive/src/lib.rs new file mode 100644 index 00000000000..d0486133ddc --- /dev/null +++ b/rust/lance-derive/src/lib.rs @@ -0,0 +1,119 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +use proc_macro::TokenStream; +use quote::quote; +use syn::{Data, DeriveInput, Fields, parse_macro_input}; + +/// Derive macro for the `DeepSizeOf` trait. +/// +/// Generates an implementation that sums the `deep_size_of_children` of all +/// fields (for structs) or the active variant's fields (for enums). +#[proc_macro_derive(DeepSizeOf)] +pub fn derive_deep_size_of(input: TokenStream) -> TokenStream { + let input = parse_macro_input!(input as DeriveInput); + let name = &input.ident; + let generics = &input.generics; + + // Add DeepSizeOf bounds to all type parameters + let mut bounded_generics = generics.clone(); + for param in &mut bounded_generics.params { + if let syn::GenericParam::Type(ref mut type_param) = *param { + type_param + .bounds + .push(syn::parse_quote!(lance_core::deepsize::DeepSizeOf)); + } + } + let (impl_generics, _, where_clause) = bounded_generics.split_for_impl(); + let (_, ty_generics, _) = generics.split_for_impl(); + + let body = match &input.data { + Data::Struct(data) => generate_struct_body(&data.fields), + Data::Enum(data) => { + let arms: Vec<_> = data + .variants + .iter() + .map(|variant| { + let variant_ident = &variant.ident; + match &variant.fields { + Fields::Unit => { + quote! { Self::#variant_ident => 0 } + } + Fields::Unnamed(fields) => { + let bindings: Vec<_> = (0..fields.unnamed.len()) + .map(|i| { + syn::Ident::new( + &format!("__field_{}", i), + proc_macro2::Span::call_site(), + ) + }) + .collect(); + let sum = bindings.iter().map(|b| { + quote! { lance_core::deepsize::DeepSizeOf::deep_size_of_children(#b, __context) } + }); + quote! { + Self::#variant_ident(#(#bindings),*) => { + 0 #(+ #sum)* + } + } + } + Fields::Named(fields) => { + let field_names: Vec<_> = + fields.named.iter().map(|f| &f.ident).collect(); + let sum = field_names.iter().map(|f| { + quote! { lance_core::deepsize::DeepSizeOf::deep_size_of_children(#f, __context) } + }); + quote! { + Self::#variant_ident { #(#field_names),* } => { + 0 #(+ #sum)* + } + } + } + } + }) + .collect(); + quote! { + match self { + #(#arms),* + } + } + } + Data::Union(_) => { + return syn::Error::new_spanned(&input, "DeepSizeOf cannot be derived for unions") + .to_compile_error() + .into(); + } + }; + + let expanded = quote! { + impl #impl_generics lance_core::deepsize::DeepSizeOf for #name #ty_generics #where_clause { + fn deep_size_of_children(&self, __context: &mut lance_core::deepsize::Context) -> usize { + #body + } + } + }; + + TokenStream::from(expanded) +} + +fn generate_struct_body(fields: &Fields) -> proc_macro2::TokenStream { + match fields { + Fields::Named(fields) => { + let field_sizes = fields.named.iter().map(|f| { + let name = &f.ident; + quote! { lance_core::deepsize::DeepSizeOf::deep_size_of_children(&self.#name, __context) } + }); + quote! { 0 #(+ #field_sizes)* } + } + Fields::Unnamed(fields) => { + let field_sizes = (0..fields.unnamed.len()).map(|i| { + let index = syn::Index::from(i); + quote! { lance_core::deepsize::DeepSizeOf::deep_size_of_children(&self.#index, __context) } + }); + quote! { 0 #(+ #field_sizes)* } + } + Fields::Unit => { + quote! { 0 } + } + } +} diff --git a/rust/lance-encoding/src/encodings/logical/primitive/blob.rs b/rust/lance-encoding/src/encodings/logical/primitive/blob.rs index 614dcb81ac2..eed3e584b7e 100644 --- a/rust/lance-encoding/src/encodings/logical/primitive/blob.rs +++ b/rust/lance-encoding/src/encodings/logical/primitive/blob.rs @@ -205,9 +205,9 @@ struct BlobCacheableState { } impl DeepSizeOf for BlobCacheableState { - fn deep_size_of_children(&self, context: &mut lance_core::cache::Context) -> usize { - self.positions.get_array_memory_size() - + self.sizes.get_array_memory_size() + fn deep_size_of_children(&self, context: &mut lance_core::deepsize::Context) -> usize { + (self.positions.as_ref() as &dyn arrow_array::Array).deep_size_of_children(context) + + (self.sizes.as_ref() as &dyn arrow_array::Array).deep_size_of_children(context) + self.inner_state.deep_size_of_children(context) } } diff --git a/rust/lance-file/Cargo.toml b/rust/lance-file/Cargo.toml index c79ffcdb57c..f08cd3457aa 100644 --- a/rust/lance-file/Cargo.toml +++ b/rust/lance-file/Cargo.toml @@ -27,7 +27,6 @@ async-trait.workspace = true byteorder.workspace = true bytes.workspace = true datafusion-common.workspace = true -deepsize.workspace = true futures.workspace = true log.workspace = true num-traits.workspace = true diff --git a/rust/lance-file/src/previous/format/metadata.rs b/rust/lance-file/src/previous/format/metadata.rs index 7e4046be893..11ba00c3243 100644 --- a/rust/lance-file/src/previous/format/metadata.rs +++ b/rust/lance-file/src/previous/format/metadata.rs @@ -6,8 +6,8 @@ use std::ops::Range; use crate::datatypes::{Fields, FieldsWithMeta}; use crate::format::pb; -use deepsize::DeepSizeOf; use lance_core::datatypes::Schema; +use lance_core::deepsize::DeepSizeOf; use lance_core::{Error, Result}; use lance_io::traits::ProtoStruct; diff --git a/rust/lance-file/src/previous/page_table.rs b/rust/lance-file/src/previous/page_table.rs index 9a3c0d71437..cc246caa585 100644 --- a/rust/lance-file/src/previous/page_table.rs +++ b/rust/lance-file/src/previous/page_table.rs @@ -4,7 +4,7 @@ use arrow_array::builder::Int64Builder; use arrow_array::{Array, Int64Array}; use arrow_schema::DataType; -use deepsize::DeepSizeOf; +use lance_core::deepsize::DeepSizeOf; use lance_io::encodings::Decoder; use lance_io::encodings::plain::PlainDecoder; use std::collections::BTreeMap; diff --git a/rust/lance-file/src/previous/reader.rs b/rust/lance-file/src/previous/reader.rs index cf30d30a547..1ab861985e1 100644 --- a/rust/lance-file/src/previous/reader.rs +++ b/rust/lance-file/src/previous/reader.rs @@ -19,11 +19,11 @@ use arrow_buffer::ArrowNativeType; use arrow_schema::{DataType, FieldRef, Schema as ArrowSchema}; use arrow_select::concat::{self, concat_batches}; use async_recursion::async_recursion; -use deepsize::DeepSizeOf; use futures::{Future, FutureExt, StreamExt, TryStreamExt, stream}; use lance_arrow::*; use lance_core::cache::{CacheKey, LanceCache}; use lance_core::datatypes::{Field, Schema}; +use lance_core::deepsize::DeepSizeOf; use lance_core::{Error, Result}; use lance_io::encodings::AsyncIndex; use lance_io::encodings::dictionary::DictionaryDecoder; diff --git a/rust/lance-file/src/reader.rs b/rust/lance-file/src/reader.rs index 3d1d8e7c361..9e4e4c449a4 100644 --- a/rust/lance-file/src/reader.rs +++ b/rust/lance-file/src/reader.rs @@ -13,8 +13,8 @@ use arrow_array::RecordBatchReader; use arrow_schema::Schema as ArrowSchema; use byteorder::{ByteOrder, LittleEndian, ReadBytesExt}; use bytes::{Bytes, BytesMut}; -use deepsize::{Context, DeepSizeOf}; use futures::{Stream, StreamExt, stream::BoxStream}; +use lance_core::deepsize::{Context, DeepSizeOf}; use lance_encoding::{ EncodingsIo, decoder::{ @@ -2511,7 +2511,7 @@ mod tests { // column_metadatas and column_infos, otherwise the moka cache weigher // dramatically underestimates entry sizes and never evicts, causing // unbounded memory growth on random-access workloads. - use deepsize::DeepSizeOf; + use lance_core::deepsize::DeepSizeOf; let fs = FsFixture::default(); let _written = create_some_file(&fs, version).await; diff --git a/rust/lance-index/Cargo.toml b/rust/lance-index/Cargo.toml index b2041257161..e3947b57856 100644 --- a/rust/lance-index/Cargo.toml +++ b/rust/lance-index/Cargo.toml @@ -26,7 +26,6 @@ datafusion-common.workspace = true datafusion-expr.workspace = true datafusion-physical-expr.workspace = true datafusion.workspace = true -deepsize.workspace = true dirs.workspace = true fst.workspace = true futures.workspace = true diff --git a/rust/lance-index/src/lib.rs b/rust/lance-index/src/lib.rs index fee421bab08..a1a4f8c4a43 100644 --- a/rust/lance-index/src/lib.rs +++ b/rust/lance-index/src/lib.rs @@ -14,7 +14,7 @@ use std::{any::Any, sync::Arc}; use crate::frag_reuse::FRAG_REUSE_INDEX_NAME; use crate::mem_wal::MEM_WAL_INDEX_NAME; use async_trait::async_trait; -use deepsize::DeepSizeOf; +use lance_core::deepsize::DeepSizeOf; use lance_core::{Error, Result}; use roaring::RoaringBitmap; use serde::{Deserialize, Serialize}; diff --git a/rust/lance-index/src/scalar.rs b/rust/lance-index/src/scalar.rs index d0378b389c8..20d09abab00 100644 --- a/rust/lance-index/src/scalar.rs +++ b/rust/lance-index/src/scalar.rs @@ -18,8 +18,8 @@ use std::pin::Pin; use std::{any::Any, ops::Bound, sync::Arc}; use datafusion_expr::{Expr, expr::ScalarFunction}; -use deepsize::DeepSizeOf; use inverted::query::{FtsQuery, FtsQueryNode, FtsSearchParams, MatchQuery, fill_fts_query_column}; +use lance_core::deepsize::DeepSizeOf; use lance_core::{Error, Result}; use lance_io::stream::{RecordBatchStream, RecordBatchStreamAdapter}; use lance_select::{NullableRowAddrSet, RowAddrTreeMap, RowSetOps}; diff --git a/rust/lance-index/src/scalar/bitmap.rs b/rust/lance-index/src/scalar/bitmap.rs index 2436b642af7..1ae2faf6e6b 100644 --- a/rust/lance-index/src/scalar/bitmap.rs +++ b/rust/lance-index/src/scalar/bitmap.rs @@ -17,12 +17,12 @@ use async_trait::async_trait; use bytes::Bytes; use datafusion::physical_plan::SendableRecordBatchStream; use datafusion_common::ScalarValue; -use deepsize::DeepSizeOf; use futures::{StreamExt, TryStreamExt, stream}; use lance_arrow::ipc::{ read_ipc_stream_single_at, read_len_prefixed_bytes_at, write_ipc_stream, write_len_prefixed_bytes, }; +use lance_core::deepsize::DeepSizeOf; use lance_core::{ Error, ROW_ID, Result, cache::{CacheCodec, CacheCodecImpl, CacheKey, LanceCache, WeakLanceCache}, @@ -180,7 +180,7 @@ pub struct BitmapIndexState { } impl DeepSizeOf for BitmapIndexState { - fn deep_size_of_children(&self, context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, context: &mut lance_core::deepsize::Context) -> usize { self.lookup_batch.get_array_memory_size() + self.null_map.deep_size_of_children(context) + self.index_map.deep_size_of_children(context) @@ -473,7 +473,7 @@ impl BitmapIndex { } impl DeepSizeOf for BitmapIndex { - fn deep_size_of_children(&self, context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, context: &mut lance_core::deepsize::Context) -> usize { self.index_map.deep_size_of_children(context) + self.store.deep_size_of_children(context) } } diff --git a/rust/lance-index/src/scalar/bloomfilter.rs b/rust/lance-index/src/scalar/bloomfilter.rs index bb8b82a4a4d..856f08af772 100644 --- a/rust/lance-index/src/scalar/bloomfilter.rs +++ b/rust/lance-index/src/scalar/bloomfilter.rs @@ -33,10 +33,10 @@ use crate::vector::VectorIndex; use crate::{Index, IndexType}; use arrow_array::{ArrayRef, RecordBatch}; use async_trait::async_trait; -use deepsize::DeepSizeOf; use lance_core::Error; use lance_core::Result; use lance_core::cache::LanceCache; +use lance_core::deepsize::DeepSizeOf; use roaring::RoaringBitmap; use super::zoned::{ZoneBound, ZoneProcessor, ZoneTrainer, rebuild_zones, search_zones}; @@ -58,7 +58,7 @@ struct BloomFilterStatistics { } impl DeepSizeOf for BloomFilterStatistics { - fn deep_size_of_children(&self, _context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, _context: &mut lance_core::deepsize::Context) -> usize { // Estimate the size of the bloom filter // We could try to get the actual size from the Sbbf if it has a method for that, // but for now we'll estimate based on the number of bytes it serializes to @@ -82,7 +82,7 @@ pub struct BloomFilterIndex { } impl DeepSizeOf for BloomFilterIndex { - fn deep_size_of_children(&self, context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, context: &mut lance_core::deepsize::Context) -> usize { self.zones.deep_size_of_children(context) } } diff --git a/rust/lance-index/src/scalar/btree.rs b/rust/lance-index/src/scalar/btree.rs index 5a32e8b15ca..69f75aeb0e9 100644 --- a/rust/lance-index/src/scalar/btree.rs +++ b/rust/lance-index/src/scalar/btree.rs @@ -38,13 +38,13 @@ use datafusion::physical_plan::{ }; use datafusion_common::{DataFusionError, ScalarValue}; use datafusion_physical_expr::{PhysicalSortExpr, expressions::Column}; -use deepsize::DeepSizeOf; use futures::{ FutureExt, Stream, StreamExt, TryFutureExt, TryStreamExt, future::BoxFuture, stream::{self}, }; use lance_arrow::ipc::{read_ipc_stream_single_at, write_ipc_stream}; +use lance_core::deepsize::DeepSizeOf; use lance_core::{ Error, ROW_ID, Result, cache::{CacheCodec, CacheCodecImpl, CacheKey, LanceCache, WeakLanceCache}, @@ -84,7 +84,7 @@ pub(crate) const BTREE_IDS_COLUMN: &str = "ids"; pub struct OrderableScalarValue(pub ScalarValue); impl DeepSizeOf for OrderableScalarValue { - fn deep_size_of_children(&self, _context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, _context: &mut lance_core::deepsize::Context) -> usize { // deepsize and size both factor in the size of the ScalarValue self.0.size() - std::mem::size_of::() } @@ -1020,7 +1020,7 @@ pub struct BTreeIndexState { } impl DeepSizeOf for BTreeIndexState { - fn deep_size_of_children(&self, _context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, _context: &mut lance_core::deepsize::Context) -> usize { // `ranges_to_files` is tiny and `RangeInclusiveMap` is not `DeepSizeOf`; // the lookup batch dominates, matching how `BTreeIndex` accounts for itself. self.lookup_batch.get_array_memory_size() @@ -1221,7 +1221,7 @@ pub struct BTreeIndex { } impl DeepSizeOf for BTreeIndex { - fn deep_size_of_children(&self, context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, context: &mut lance_core::deepsize::Context) -> usize { // We don't include the index cache, or anything stored in it. For example: // sub_index and fri. self.page_lookup.deep_size_of_children(context) @@ -2965,10 +2965,10 @@ mod tests { }; use datafusion_common::{DataFusionError, ScalarValue}; use datafusion_physical_expr::{PhysicalSortExpr, expressions::col}; - use deepsize::DeepSizeOf; use futures::TryStreamExt; use futures::stream; use lance_core::cache::LanceCache; + use lance_core::deepsize::DeepSizeOf; use lance_core::utils::tempfile::TempObjDir; use lance_datafusion::{chunker::break_stream, datagen::DatafusionDatagenExt}; use lance_datagen::{ArrayGeneratorExt, BatchCount, RowCount, array, gen_batch}; diff --git a/rust/lance-index/src/scalar/btree/flat.rs b/rust/lance-index/src/scalar/btree/flat.rs index 4240753772b..212ef6490be 100644 --- a/rust/lance-index/src/scalar/btree/flat.rs +++ b/rust/lance-index/src/scalar/btree/flat.rs @@ -12,11 +12,11 @@ use arrow_array::{ use datafusion_common::DFSchema; use datafusion_expr::execution_props::ExecutionProps; use datafusion_physical_expr::create_physical_expr; -use deepsize::DeepSizeOf; use lance_arrow::RecordBatchExt; use lance_arrow::ipc::{read_ipc_stream_single_at, read_len_prefixed_bytes_at, write_ipc_stream}; use lance_core::Result; use lance_core::cache::CacheCodecImpl; +use lance_core::deepsize::DeepSizeOf; use lance_core::utils::address::RowAddress; use lance_select::{NullableRowAddrSet, RowAddrTreeMap, RowSetOps}; use roaring::RoaringBitmap; @@ -43,7 +43,7 @@ pub struct FlatIndex { } impl DeepSizeOf for FlatIndex { - fn deep_size_of_children(&self, _context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, _context: &mut lance_core::deepsize::Context) -> usize { self.data.get_array_memory_size() } } diff --git a/rust/lance-index/src/scalar/fmindex.rs b/rust/lance-index/src/scalar/fmindex.rs index c5a43d691f7..af7e67950a1 100644 --- a/rust/lance-index/src/scalar/fmindex.rs +++ b/rust/lance-index/src/scalar/fmindex.rs @@ -28,9 +28,9 @@ use arrow_array::RecordBatch; use arrow_schema::{DataType, Field}; use async_trait::async_trait; use datafusion::execution::SendableRecordBatchStream; -use deepsize::DeepSizeOf; use futures::StreamExt; use lance_core::cache::LanceCache; +use lance_core::deepsize::DeepSizeOf; use lance_core::{Error, Result}; use roaring::RoaringBitmap; @@ -631,7 +631,7 @@ pub struct FMIndex { } impl DeepSizeOf for FMIndex { - fn deep_size_of_children(&self, _context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, _context: &mut lance_core::deepsize::Context) -> usize { self.wavelet.deep_size() + self.row_ids.len() * 8 + self.sa_samples.len() * 8 @@ -1139,7 +1139,7 @@ pub struct FMIndexScalarIndex { } impl DeepSizeOf for FMIndexScalarIndex { - fn deep_size_of_children(&self, _ctx: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, _ctx: &mut lance_core::deepsize::Context) -> usize { self.partitions.iter().map(|p| p.fm.deep_size()).sum() } } diff --git a/rust/lance-index/src/scalar/inverted/builder.rs b/rust/lance-index/src/scalar/inverted/builder.rs index 283806ed32f..59a992694c9 100644 --- a/rust/lance-index/src/scalar/inverted/builder.rs +++ b/rust/lance-index/src/scalar/inverted/builder.rs @@ -18,12 +18,12 @@ use arrow_schema::{DataType, Field, Schema, SchemaRef}; use bitpacking::{BitPacker, BitPacker4x}; use bytes::Bytes; use datafusion::execution::{RecordBatchStream, SendableRecordBatchStream}; -use deepsize::DeepSizeOf; use fst::Streamer; use futures::{Stream, StreamExt, TryStreamExt}; use lance_arrow::json::JSON_EXT_NAME; use lance_arrow::{ARROW_EXT_NAME_KEY, iter_str_array}; use lance_core::cache::LanceCache; +use lance_core::deepsize::DeepSizeOf; use lance_core::error::LanceOptionExt; use lance_core::utils::tokio::{IO_CORE_RESERVATION, get_num_compute_intensive_cpus, spawn_cpu}; use lance_core::{Error, ROW_ID, ROW_ID_FIELD, Result}; @@ -2241,7 +2241,7 @@ mod tests { } impl DeepSizeOf for CountingStore { - fn deep_size_of_children(&self, _context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, _context: &mut lance_core::deepsize::Context) -> usize { 0 } } diff --git a/rust/lance-index/src/scalar/inverted/index.rs b/rust/lance-index/src/scalar/inverted/index.rs index e7ebfec1e82..03462270858 100644 --- a/rust/lance-index/src/scalar/inverted/index.rs +++ b/rust/lance-index/src/scalar/inverted/index.rs @@ -35,12 +35,12 @@ use async_trait::async_trait; use datafusion::execution::SendableRecordBatchStream; use datafusion::physical_plan::metrics::Time; use datafusion::physical_plan::stream::RecordBatchStreamAdapter; -use deepsize::DeepSizeOf; use fst::{Automaton, IntoStreamer, Streamer}; use futures::{FutureExt, Stream, StreamExt, TryStreamExt, stream}; use itertools::Itertools; use lance_arrow::{RecordBatchExt, iter_str_array}; use lance_core::cache::{CacheCodec, CacheKey, LanceCache, WeakLanceCache}; +use lance_core::deepsize::DeepSizeOf; use lance_core::error::{DataFusionResult, LanceOptionExt}; use lance_core::utils::tokio::{get_num_compute_intensive_cpus, spawn_cpu}; use lance_core::utils::tracing::{IO_TYPE_LOAD_SCALAR_PART, TRACE_IO_EVENTS}; @@ -251,7 +251,7 @@ impl FromStr for TokenSetFormat { } impl DeepSizeOf for TokenSetFormat { - fn deep_size_of_children(&self, _: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, _: &mut lance_core::deepsize::Context) -> usize { 0 } } @@ -371,7 +371,7 @@ impl Debug for InvertedIndex { } impl DeepSizeOf for InvertedIndex { - fn deep_size_of_children(&self, context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, context: &mut lance_core::deepsize::Context) -> usize { self.partitions.deep_size_of_children(context) } } @@ -1416,7 +1416,7 @@ impl Default for TokenMap { } impl DeepSizeOf for TokenMap { - fn deep_size_of_children(&self, ctx: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, ctx: &mut lance_core::deepsize::Context) -> usize { match self { Self::HashMap(map) => map.deep_size_of_children(ctx), Self::Fst(map) => map.as_fst().size(), @@ -1855,7 +1855,7 @@ impl std::fmt::Debug for PostingListReader { } impl DeepSizeOf for PostingListReader { - fn deep_size_of_children(&self, context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, context: &mut lance_core::deepsize::Context) -> usize { let metadata_size = match &self.metadata { PostingMetadata::LegacyV1 { offsets, @@ -2620,7 +2620,7 @@ fn sliced_cache_bytes(array: &dyn Array) -> usize { } impl DeepSizeOf for Positions { - fn deep_size_of_children(&self, context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, context: &mut lance_core::deepsize::Context) -> usize { self.0.deep_size_of_children(context) } } @@ -2701,7 +2701,7 @@ pub enum CompressedPositionStorage { } impl DeepSizeOf for CompressedPositionStorage { - fn deep_size_of_children(&self, _context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, _context: &mut lance_core::deepsize::Context) -> usize { match self { Self::LegacyPerDoc(positions) => sliced_cache_bytes(positions), Self::SharedStream(stream) => stream.size(), @@ -2978,7 +2978,7 @@ pub struct PlainPostingList { } impl DeepSizeOf for PlainPostingList { - fn deep_size_of_children(&self, _context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, _context: &mut lance_core::deepsize::Context) -> usize { self.row_ids.len() * std::mem::size_of::() + self.frequencies.len() * std::mem::size_of::() + self @@ -3081,7 +3081,7 @@ pub struct CompressedPostingList { } impl DeepSizeOf for CompressedPostingList { - fn deep_size_of_children(&self, context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, context: &mut lance_core::deepsize::Context) -> usize { sliced_cache_bytes(&self.blocks) + self .positions @@ -5919,7 +5919,7 @@ mod tests { } impl DeepSizeOf for CountingStore { - fn deep_size_of_children(&self, context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, context: &mut lance_core::deepsize::Context) -> usize { self.inner.deep_size_of_children(context) } } diff --git a/rust/lance-index/src/scalar/inverted/lazy_docset.rs b/rust/lance-index/src/scalar/inverted/lazy_docset.rs index 41e1c76473a..7a0ee41efd8 100644 --- a/rust/lance-index/src/scalar/inverted/lazy_docset.rs +++ b/rust/lance-index/src/scalar/inverted/lazy_docset.rs @@ -94,8 +94,8 @@ impl std::fmt::Debug for LazyDocSet { } } -impl deepsize::DeepSizeOf for LazyDocSet { - fn deep_size_of_children(&self, ctx: &mut deepsize::Context) -> usize { +impl lance_core::deepsize::DeepSizeOf for LazyDocSet { + fn deep_size_of_children(&self, ctx: &mut lance_core::deepsize::Context) -> usize { match self { Self::Loaded(l) => l.docs.deep_size_of_children(ctx), Self::Deferred(d) => { diff --git a/rust/lance-index/src/scalar/json.rs b/rust/lance-index/src/scalar/json.rs index 81b5a0b57e3..7adf055db61 100644 --- a/rust/lance-index/src/scalar/json.rs +++ b/rust/lance-index/src/scalar/json.rs @@ -21,8 +21,8 @@ use datafusion_physical_expr::{ PhysicalExpr, ScalarFunctionExpr, expressions::{Column, Literal}, }; -use deepsize::DeepSizeOf; use futures::StreamExt; +use lance_core::deepsize::DeepSizeOf; use lance_datafusion::exec::{LanceExecutionOptions, OneShotExec, get_session_context}; use lance_datafusion::udf::json::JsonbType; use prost::Message; @@ -61,7 +61,7 @@ impl JsonIndex { } impl DeepSizeOf for JsonIndex { - fn deep_size_of_children(&self, context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, context: &mut lance_core::deepsize::Context) -> usize { self.target_index.deep_size_of_children(context) + self.path.deep_size_of_children(context) } } diff --git a/rust/lance-index/src/scalar/label_list.rs b/rust/lance-index/src/scalar/label_list.rs index 55cd392a1b7..cf357d89585 100644 --- a/rust/lance-index/src/scalar/label_list.rs +++ b/rust/lance-index/src/scalar/label_list.rs @@ -17,10 +17,10 @@ use bytes::Bytes; use datafusion::execution::RecordBatchStream; use datafusion::physical_plan::{SendableRecordBatchStream, stream::RecordBatchStreamAdapter}; use datafusion_common::ScalarValue; -use deepsize::DeepSizeOf; use futures::{StreamExt, TryStream, TryStreamExt, stream::BoxStream}; use lance_arrow::ipc::{read_len_prefixed_bytes_at, write_len_prefixed_bytes}; use lance_core::cache::{CacheCodec, CacheCodecImpl, CacheKey, LanceCache}; +use lance_core::deepsize::DeepSizeOf; use lance_core::error::LanceOptionExt; use lance_core::{Error, ROW_ID, Result}; use lance_select::{NullableRowAddrSet, RowAddrTreeMap, RowSetOps}; @@ -504,7 +504,7 @@ pub struct LabelListIndexState { } impl DeepSizeOf for LabelListIndexState { - fn deep_size_of_children(&self, context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, context: &mut lance_core::deepsize::Context) -> usize { self.bitmap_state.deep_size_of_children(context) + self.list_nulls.deep_size_of_children(context) } diff --git a/rust/lance-index/src/scalar/lance_format.rs b/rust/lance-index/src/scalar/lance_format.rs index 79faee37e9e..f3ba9eb93b1 100644 --- a/rust/lance-index/src/scalar/lance_format.rs +++ b/rust/lance-index/src/scalar/lance_format.rs @@ -8,8 +8,8 @@ use arrow_array::RecordBatch; use arrow_schema::Schema; use async_trait::async_trait; use bytes::Bytes; -use deepsize::DeepSizeOf; use futures::TryStreamExt; +use lance_core::deepsize::DeepSizeOf; use lance_core::{Error, Result, cache::LanceCache}; use lance_encoding::decoder::{DecoderPlugins, FilterExpression}; use lance_encoding::version::LanceFileVersion; @@ -48,7 +48,7 @@ pub struct LanceIndexStore { } impl DeepSizeOf for LanceIndexStore { - fn deep_size_of_children(&self, context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, context: &mut lance_core::deepsize::Context) -> usize { self.object_store.deep_size_of_children(context) + self.index_dir.as_ref().deep_size_of_children(context) + self.metadata_cache.deep_size_of_children(context) diff --git a/rust/lance-index/src/scalar/ngram.rs b/rust/lance-index/src/scalar/ngram.rs index cab1f37ee8d..72ef8d53a92 100644 --- a/rust/lance-index/src/scalar/ngram.rs +++ b/rust/lance-index/src/scalar/ngram.rs @@ -29,10 +29,10 @@ use arrow_array::{BinaryArray, RecordBatch, UInt32Array}; use arrow_schema::{DataType, Field, Schema, SchemaRef}; use async_trait::async_trait; use datafusion::execution::SendableRecordBatchStream; -use deepsize::DeepSizeOf; use futures::{FutureExt, Stream, StreamExt, TryStreamExt, stream}; use lance_arrow::iter_str_array; use lance_core::cache::{CacheKey, LanceCache, WeakLanceCache}; +use lance_core::deepsize::DeepSizeOf; use lance_core::error::LanceOptionExt; use lance_core::utils::address::RowAddress; use lance_core::utils::tempfile::TempDir; @@ -155,7 +155,7 @@ pub struct NGramPostingList { } impl DeepSizeOf for NGramPostingList { - fn deep_size_of_children(&self, _: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, _: &mut lance_core::deepsize::Context) -> usize { self.bitmap.serialized_size() } } @@ -213,7 +213,7 @@ struct NGramPostingListReader { } impl DeepSizeOf for NGramPostingListReader { - fn deep_size_of_children(&self, _: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, _: &mut lance_core::deepsize::Context) -> usize { 0 } } @@ -285,7 +285,7 @@ impl std::fmt::Debug for NGramIndex { } impl DeepSizeOf for NGramIndex { - fn deep_size_of_children(&self, context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, context: &mut lance_core::deepsize::Context) -> usize { self.tokens.deep_size_of_children(context) } } diff --git a/rust/lance-index/src/scalar/rtree.rs b/rust/lance-index/src/scalar/rtree.rs index 5cefae36da6..5d5ac2a3a92 100644 --- a/rust/lance-index/src/scalar/rtree.rs +++ b/rust/lance-index/src/scalar/rtree.rs @@ -24,7 +24,6 @@ use async_trait::async_trait; use datafusion::execution::SendableRecordBatchStream; use datafusion::physical_plan::stream::RecordBatchStreamAdapter; use datafusion_common::DataFusionError; -use deepsize::DeepSizeOf; use futures::future::BoxFuture; use futures::{FutureExt, Stream, StreamExt, TryFutureExt, TryStreamExt, stream}; use geoarrow_array::array::{RectArray, from_arrow_array}; @@ -33,6 +32,7 @@ use geoarrow_array::{GeoArrowArray, GeoArrowArrayAccessor, IntoArrow}; use geoarrow_schema::{Dimension, RectType}; use lance_arrow::RecordBatchExt; use lance_core::cache::{CacheKey, LanceCache, WeakLanceCache}; +use lance_core::deepsize::DeepSizeOf; use lance_core::utils::address::RowAddress; use lance_core::utils::tempfile::TempDir; use lance_core::{Error, ROW_ID, Result}; @@ -235,8 +235,8 @@ pub enum RTreeCacheKey { pub struct RTreeCacheValue(Arc); impl DeepSizeOf for RTreeCacheValue { - fn deep_size_of_children(&self, _context: &mut deepsize::Context) -> usize { - self.0.get_array_memory_size() + fn deep_size_of_children(&self, context: &mut lance_core::deepsize::Context) -> usize { + self.0.deep_size_of_children(context) } } @@ -430,7 +430,7 @@ impl RTreeIndex { } impl DeepSizeOf for RTreeIndex { - fn deep_size_of_children(&self, context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, context: &mut lance_core::deepsize::Context) -> usize { let mut total_size = 0; total_size += self.store.deep_size_of_children(context); diff --git a/rust/lance-index/src/scalar/zonemap.rs b/rust/lance-index/src/scalar/zonemap.rs index 60f66c91c59..fda1613bf1b 100644 --- a/rust/lance-index/src/scalar/zonemap.rs +++ b/rust/lance-index/src/scalar/zonemap.rs @@ -40,9 +40,9 @@ use crate::scalar::FragReuseIndex; use crate::vector::VectorIndex; use crate::{Index, IndexType}; use async_trait::async_trait; -use deepsize::DeepSizeOf; use lance_core::Error; use lance_core::Result; +use lance_core::deepsize::DeepSizeOf; use roaring::RoaringBitmap; use super::zoned::{ZoneBound, ZoneProcessor, ZoneTrainer, rebuild_zones, search_zones}; @@ -66,7 +66,7 @@ struct ZoneMapStatistics { } impl DeepSizeOf for ZoneMapStatistics { - fn deep_size_of_children(&self, _context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, _context: &mut lance_core::deepsize::Context) -> usize { // Estimate sizes for ScalarValue let min_size = self.min.size() - std::mem::size_of::(); let max_size = self.max.size() - std::mem::size_of::(); @@ -126,7 +126,7 @@ impl std::fmt::Debug for ZoneMapIndex { } impl DeepSizeOf for ZoneMapIndex { - fn deep_size_of_children(&self, context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, context: &mut lance_core::deepsize::Context) -> usize { self.zones.deep_size_of_children(context) } } diff --git a/rust/lance-index/src/vector.rs b/rust/lance-index/src/vector.rs index a6ff3eb02d5..99f3acf1cb9 100644 --- a/rust/lance-index/src/vector.rs +++ b/rust/lance-index/src/vector.rs @@ -13,9 +13,9 @@ use arrow_schema::Field; use async_trait::async_trait; use datafusion::execution::SendableRecordBatchStream; use datafusion::physical_plan::stream::RecordBatchStreamAdapter; -use deepsize::DeepSizeOf; use futures::stream; use ivf::storage::IvfModel; +use lance_core::deepsize::DeepSizeOf; use lance_core::{Error, ROW_ID_FIELD, Result}; use lance_io::traits::Reader; use lance_linalg::distance::DistanceType; diff --git a/rust/lance-index/src/vector/bq/builder.rs b/rust/lance-index/src/vector/bq/builder.rs index df6e6591299..7aaa0aa29d2 100644 --- a/rust/lance-index/src/vector/bq/builder.rs +++ b/rust/lance-index/src/vector/bq/builder.rs @@ -8,8 +8,8 @@ use arrow::datatypes::{Float16Type, Float32Type, Float64Type}; use arrow_array::{Array, ArrayRef, FixedSizeListArray, UInt8Array}; use arrow_schema::{DataType, Field}; use bitvec::prelude::{BitVec, Lsb0}; -use deepsize::DeepSizeOf; use lance_arrow::{ArrowFloatType, FixedSizeListArrayExt, FloatArray, FloatType}; +use lance_core::deepsize::DeepSizeOf; use lance_core::{Error, Result}; use ndarray::{Axis, ShapeBuilder, s}; use num_traits::{AsPrimitive, FromPrimitive}; diff --git a/rust/lance-index/src/vector/bq/storage.rs b/rust/lance-index/src/vector/bq/storage.rs index 4c2aeb7363e..e48f5d2fa4e 100644 --- a/rust/lance-index/src/vector/bq/storage.rs +++ b/rust/lance-index/src/vector/bq/storage.rs @@ -14,9 +14,9 @@ use arrow_array::{ use arrow_schema::{DataType, Field, SchemaRef}; use async_trait::async_trait; use bytes::{Bytes, BytesMut}; -use deepsize::DeepSizeOf; use itertools::Itertools; use lance_arrow::{ArrowFloatType, FixedSizeListArrayExt, FloatArray, RecordBatchExt}; +use lance_core::deepsize::DeepSizeOf; use lance_core::{Error, ROW_ID, Result}; use lance_file::previous::reader::FileReader as PreviousFileReader; use lance_linalg::distance::{DistanceType, Dot}; @@ -201,10 +201,10 @@ impl RabitQuantizationMetadata { } impl DeepSizeOf for RabitQuantizationMetadata { - fn deep_size_of_children(&self, _context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, context: &mut lance_core::deepsize::Context) -> usize { self.rotate_mat .as_ref() - .map(|inv_p| inv_p.get_array_memory_size()) + .map(|inv_p| (inv_p as &dyn arrow_array::Array).deep_size_of_children(context)) .unwrap_or(0) + self .fast_rotation_signs @@ -288,8 +288,8 @@ pub struct RabitQuantizationStorage { } impl DeepSizeOf for RabitQuantizationStorage { - fn deep_size_of_children(&self, context: &mut deepsize::Context) -> usize { - self.metadata.deep_size_of_children(context) + self.batch.get_array_memory_size() + fn deep_size_of_children(&self, context: &mut lance_core::deepsize::Context) -> usize { + self.metadata.deep_size_of_children(context) + self.batch.deep_size_of_children(context) } } diff --git a/rust/lance-index/src/vector/flat/index.rs b/rust/lance-index/src/vector/flat/index.rs index 6ebab23688f..c12f23a7c15 100644 --- a/rust/lance-index/src/vector/flat/index.rs +++ b/rust/lance-index/src/vector/flat/index.rs @@ -10,7 +10,7 @@ use std::sync::Arc; use arrow::array::AsArray; use arrow_array::{Array, ArrayRef, Float32Array, RecordBatch, UInt64Array}; use arrow_schema::{DataType, Field, Schema, SchemaRef}; -use deepsize::DeepSizeOf; +use lance_core::deepsize::DeepSizeOf; use lance_core::{Error, ROW_ID_FIELD, Result}; use lance_file::previous::reader::FileReader as PreviousFileReader; use lance_linalg::distance::DistanceType; diff --git a/rust/lance-index/src/vector/flat/storage.rs b/rust/lance-index/src/vector/flat/storage.rs index 4d1bf840111..c3ec30d5086 100644 --- a/rust/lance-index/src/vector/flat/storage.rs +++ b/rust/lance-index/src/vector/flat/storage.rs @@ -17,7 +17,7 @@ use arrow_array::{ types::{Float32Type, UInt64Type}, }; use arrow_schema::{DataType, SchemaRef}; -use deepsize::DeepSizeOf; +use lance_core::deepsize::DeepSizeOf; use lance_core::{Error, ROW_ID, Result}; use lance_file::previous::reader::FileReader as PreviousFileReader; use lance_linalg::distance::hamming::hamming; @@ -38,7 +38,7 @@ pub struct FlatFloatStorage { } impl DeepSizeOf for FlatFloatStorage { - fn deep_size_of_children(&self, _: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, _: &mut lance_core::deepsize::Context) -> usize { self.batch.get_array_memory_size() } } @@ -200,7 +200,7 @@ pub struct FlatBinStorage { } impl DeepSizeOf for FlatBinStorage { - fn deep_size_of_children(&self, _: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, _: &mut lance_core::deepsize::Context) -> usize { self.batch.get_array_memory_size() } } diff --git a/rust/lance-index/src/vector/graph.rs b/rust/lance-index/src/vector/graph.rs index b79ce4e9947..097aa064d67 100644 --- a/rust/lance-index/src/vector/graph.rs +++ b/rust/lance-index/src/vector/graph.rs @@ -9,7 +9,7 @@ use std::collections::BinaryHeap; use std::sync::Arc; use arrow_schema::{DataType, Field}; -use deepsize::DeepSizeOf; +use lance_core::deepsize::DeepSizeOf; use crate::vector::hnsw::builder::HnswQueryParams; diff --git a/rust/lance-index/src/vector/graph/builder.rs b/rust/lance-index/src/vector/graph/builder.rs index 8183ae3acfe..36e42b7cf9a 100644 --- a/rust/lance-index/src/vector/graph/builder.rs +++ b/rust/lance-index/src/vector/graph/builder.rs @@ -1,7 +1,7 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright The Lance Authors -use deepsize::DeepSizeOf; +use lance_core::deepsize::DeepSizeOf; use super::OrderedFloat; use super::OrderedNode; diff --git a/rust/lance-index/src/vector/hnsw.rs b/rust/lance-index/src/vector/hnsw.rs index 02e9d1682e2..a618f34753f 100644 --- a/rust/lance-index/src/vector/hnsw.rs +++ b/rust/lance-index/src/vector/hnsw.rs @@ -7,8 +7,8 @@ //! use arrow_schema::{DataType, Field}; -use deepsize::DeepSizeOf; use itertools::Itertools; +use lance_core::deepsize::DeepSizeOf; use serde::{Deserialize, Serialize}; use self::builder::HnswBuildParams; diff --git a/rust/lance-index/src/vector/hnsw/builder.rs b/rust/lance-index/src/vector/hnsw/builder.rs index 789dc7ef904..214750dfafa 100644 --- a/rust/lance-index/src/vector/hnsw/builder.rs +++ b/rust/lance-index/src/vector/hnsw/builder.rs @@ -8,8 +8,8 @@ use arrow::compute::concat_batches; use arrow::datatypes::{DataType, UInt32Type}; use arrow_array::{ArrayRef, Float32Array, ListArray, RecordBatch, UInt64Array}; use crossbeam_queue::ArrayQueue; -use deepsize::DeepSizeOf; use itertools::Itertools; +use lance_core::deepsize::DeepSizeOf; use lance_core::utils::tokio::get_num_compute_intensive_cpus; use lance_linalg::distance::DistanceType; @@ -165,7 +165,7 @@ struct HnswCore { } impl DeepSizeOf for HnswCore { - fn deep_size_of_children(&self, context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, context: &mut lance_core::deepsize::Context) -> usize { self.params.deep_size_of_children(context) + self.graph.deep_size_of_children(context) + self.level_count.deep_size_of_children(context) @@ -497,7 +497,7 @@ struct HnswBuilder { } impl DeepSizeOf for HnswBuilder { - fn deep_size_of_children(&self, context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, context: &mut lance_core::deepsize::Context) -> usize { self.params.deep_size_of_children(context) + self.nodes.deep_size_of_children(context) + self.level_count.deep_size_of_children(context) @@ -830,7 +830,7 @@ struct LoadedHnswGraph { } impl DeepSizeOf for LoadedHnswGraph { - fn deep_size_of_children(&self, _context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, _context: &mut lance_core::deepsize::Context) -> usize { // `level_neighbors` are zero-copy views into `batch`, so counting // `batch` alone avoids double counting (mirrors // `vector/flat/storage.rs`). The upper-level `level_lookup` maps are @@ -950,7 +950,7 @@ enum HnswGraph { } impl DeepSizeOf for HnswGraph { - fn deep_size_of_children(&self, context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, context: &mut lance_core::deepsize::Context) -> usize { match self { Self::Built(nodes) => nodes.deep_size_of_children(context), Self::Loaded(graph) => graph.deep_size_of_children(context), @@ -1306,8 +1306,8 @@ mod tests { use arrow_array::{ArrayRef, FixedSizeListArray, RecordBatch, UInt8Array, UInt32Array}; use arrow_schema::Schema; - use deepsize::DeepSizeOf; use lance_arrow::FixedSizeListArrayExt; + use lance_core::deepsize::DeepSizeOf; use lance_file::previous::{ reader::FileReader as PreviousFileReader, writer::{ diff --git a/rust/lance-index/src/vector/hnsw/index.rs b/rust/lance-index/src/vector/hnsw/index.rs index 8d19c2e634a..0ae42f59414 100644 --- a/rust/lance-index/src/vector/hnsw/index.rs +++ b/rust/lance-index/src/vector/hnsw/index.rs @@ -12,9 +12,9 @@ use arrow_array::{Float32Array, RecordBatch, UInt32Array}; use async_trait::async_trait; use datafusion::execution::SendableRecordBatchStream; use datafusion::physical_plan::stream::RecordBatchStreamAdapter; -use deepsize::DeepSizeOf; use lance_arrow::RecordBatchExt; use lance_core::ROW_ID; +use lance_core::deepsize::DeepSizeOf; use lance_core::{Error, Result, datatypes::Schema}; use lance_file::previous::reader::FileReader as PreviousFileReader; use lance_io::traits::Reader; diff --git a/rust/lance-index/src/vector/ivf/storage.rs b/rust/lance-index/src/vector/ivf/storage.rs index ed5e70f5514..5d58401bb12 100644 --- a/rust/lance-index/src/vector/ivf/storage.rs +++ b/rust/lance-index/src/vector/ivf/storage.rs @@ -4,9 +4,9 @@ use std::ops::Range; use arrow_array::{Array, ArrayRef, FixedSizeListArray, Float32Array, UInt32Array}; -use deepsize::DeepSizeOf; use itertools::Itertools; use lance_arrow::FixedSizeListArrayExt; +use lance_core::deepsize::DeepSizeOf; use lance_core::{Error, Result}; use lance_file::previous::{ reader::FileReader as PreviousFileReader, writer::FileWriter as PreviousFileWriter, @@ -41,10 +41,10 @@ pub struct IvfModel { } impl DeepSizeOf for IvfModel { - fn deep_size_of_children(&self, context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, context: &mut lance_core::deepsize::Context) -> usize { self.centroids .as_ref() - .map(|centroids| centroids.get_array_memory_size()) + .map(|centroids| (centroids as &dyn arrow_array::Array).deep_size_of_children(context)) .unwrap_or_default() + self.lengths.deep_size_of_children(context) + self.offsets.deep_size_of_children(context) diff --git a/rust/lance-index/src/vector/pq.rs b/rust/lance-index/src/vector/pq.rs index bbb05238a30..5749e56ed31 100644 --- a/rust/lance-index/src/vector/pq.rs +++ b/rust/lance-index/src/vector/pq.rs @@ -10,9 +10,9 @@ use arrow::datatypes::{self, ArrowPrimitiveType}; use arrow_array::{Array, FixedSizeListArray, UInt8Array, cast::AsArray}; use arrow_array::{ArrayRef, Float32Array, PrimitiveArray}; use arrow_schema::{DataType, Field}; -use deepsize::DeepSizeOf; use distance::build_distance_table_dot; use lance_arrow::*; +use lance_core::deepsize::DeepSizeOf; use lance_core::{Error, Result, assume_eq}; use lance_linalg::distance::{DistanceType, Dot, L2, l2::L2Prepared}; use lance_table::utils::LanceIteratorExtension; @@ -54,7 +54,7 @@ pub struct ProductQuantizer { } impl DeepSizeOf for ProductQuantizer { - fn deep_size_of_children(&self, _context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, _context: &mut lance_core::deepsize::Context) -> usize { self.codebook.get_array_memory_size() + self.num_sub_vectors.deep_size_of_children(_context) + self.num_bits.deep_size_of_children(_context) diff --git a/rust/lance-index/src/vector/pq/storage.rs b/rust/lance-index/src/vector/pq/storage.rs index 17443d15ea6..68747713aac 100644 --- a/rust/lance-index/src/vector/pq/storage.rs +++ b/rust/lance-index/src/vector/pq/storage.rs @@ -12,7 +12,7 @@ use std::{ }; use arrow::datatypes::{self, UInt8Type}; -use arrow_array::{Array, ArrayRef, ArrowPrimitiveType, PrimitiveArray}; +use arrow_array::{ArrayRef, ArrowPrimitiveType, PrimitiveArray}; use arrow_array::{ FixedSizeListArray, RecordBatch, UInt8Array, UInt64Array, cast::AsArray, @@ -21,8 +21,8 @@ use arrow_array::{ use arrow_schema::{DataType, SchemaRef}; use async_trait::async_trait; use bytes::{Bytes, BytesMut}; -use deepsize::DeepSizeOf; use lance_arrow::{FixedSizeListArrayExt, RecordBatchExt}; +use lance_core::deepsize::DeepSizeOf; use lance_core::{Error, ROW_ID, Result}; use lance_file::previous::{ reader::FileReader as PreviousFileReader, writer::FileWriter as PreviousFileWriter, @@ -70,10 +70,10 @@ pub struct ProductQuantizationMetadata { } impl DeepSizeOf for ProductQuantizationMetadata { - fn deep_size_of_children(&self, _context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, context: &mut lance_core::deepsize::Context) -> usize { self.codebook .as_ref() - .map(|codebook| codebook.get_array_memory_size()) + .map(|codebook| (codebook as &dyn arrow_array::Array).deep_size_of_children(context)) .unwrap_or(0) } } @@ -166,18 +166,20 @@ pub struct ProductQuantizationStorage { } impl DeepSizeOf for ProductQuantizationStorage { - fn deep_size_of_children(&self, _context: &mut deepsize::Context) -> usize { - self.batch.get_array_memory_size() + fn deep_size_of_children(&self, context: &mut lance_core::deepsize::Context) -> usize { + self.batch.deep_size_of_children(context) + self .metadata .codebook .as_ref() - .map(|codebook| codebook.get_array_memory_size()) + .map(|codebook| { + (codebook as &dyn arrow_array::Array).deep_size_of_children(context) + }) .unwrap_or(0) + self .pairwise_distance_table .get() - .map(|table| table.deep_size_of_children(_context)) + .map(|table| table.deep_size_of_children(context)) .unwrap_or(0) } } diff --git a/rust/lance-index/src/vector/quantizer.rs b/rust/lance-index/src/vector/quantizer.rs index 8e2dd333606..8ee64669f32 100644 --- a/rust/lance-index/src/vector/quantizer.rs +++ b/rust/lance-index/src/vector/quantizer.rs @@ -11,8 +11,8 @@ use arrow_array::{Array, ArrayRef, FixedSizeListArray, RecordBatch, UInt32Array, use arrow_schema::Field; use async_trait::async_trait; use bytes::Bytes; -use deepsize::DeepSizeOf; use lance_arrow::RecordBatchExt; +use lance_core::deepsize::DeepSizeOf; use lance_core::{Error, ROW_ID, Result}; use lance_file::previous::reader::FileReader as PreviousFileReader; use lance_io::traits::Reader; @@ -296,7 +296,7 @@ pub struct IvfQuantizationStorage { } impl DeepSizeOf for IvfQuantizationStorage { - fn deep_size_of_children(&self, context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, context: &mut lance_core::deepsize::Context) -> usize { self.reader.deep_size_of_children(context) + self.quantizer.deep_size_of_children(context) + self.metadata.deep_size_of_children(context) diff --git a/rust/lance-index/src/vector/sq.rs b/rust/lance-index/src/vector/sq.rs index 9ddfed8e9af..bdc3204021e 100644 --- a/rust/lance-index/src/vector/sq.rs +++ b/rust/lance-index/src/vector/sq.rs @@ -9,9 +9,9 @@ use arrow_array::{Array, ArrayRef, FixedSizeListArray, UInt8Array}; use arrow_schema::{DataType, Field}; use builder::SQBuildParams; -use deepsize::DeepSizeOf; use itertools::Itertools; use lance_arrow::*; +use lance_core::deepsize::DeepSizeOf; use lance_core::{Error, Result}; use lance_linalg::distance::DistanceType; use num_traits::*; @@ -34,7 +34,7 @@ pub struct ScalarQuantizer { } impl DeepSizeOf for ScalarQuantizer { - fn deep_size_of_children(&self, _context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, _context: &mut lance_core::deepsize::Context) -> usize { 0 } } diff --git a/rust/lance-index/src/vector/sq/storage.rs b/rust/lance-index/src/vector/sq/storage.rs index 58165e31caf..1e5eebda0d9 100644 --- a/rust/lance-index/src/vector/sq/storage.rs +++ b/rust/lance-index/src/vector/sq/storage.rs @@ -12,7 +12,7 @@ use arrow_array::{ }; use arrow_schema::{DataType, SchemaRef}; use async_trait::async_trait; -use deepsize::DeepSizeOf; +use lance_core::deepsize::DeepSizeOf; use lance_core::{Error, ROW_ID, Result}; use lance_file::previous::reader::FileReader as PreviousFileReader; use lance_io::object_store::ObjectStore; @@ -44,7 +44,7 @@ pub struct ScalarQuantizationMetadata { } impl DeepSizeOf for ScalarQuantizationMetadata { - fn deep_size_of_children(&self, _context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, _context: &mut lance_core::deepsize::Context) -> usize { 0 } } @@ -138,8 +138,8 @@ impl SQStorageChunk { } impl DeepSizeOf for SQStorageChunk { - fn deep_size_of_children(&self, _context: &mut deepsize::Context) -> usize { - self.batch.get_array_memory_size() + fn deep_size_of_children(&self, context: &mut lance_core::deepsize::Context) -> usize { + self.batch.deep_size_of_children(context) } } @@ -155,7 +155,7 @@ pub struct ScalarQuantizationStorage { } impl DeepSizeOf for ScalarQuantizationStorage { - fn deep_size_of_children(&self, context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, context: &mut lance_core::deepsize::Context) -> usize { self.chunks .iter() .map(|c| c.deep_size_of_children(context)) diff --git a/rust/lance-index/src/vector/storage.rs b/rust/lance-index/src/vector/storage.rs index 1443a1f355d..f0924e021d0 100644 --- a/rust/lance-index/src/vector/storage.rs +++ b/rust/lance-index/src/vector/storage.rs @@ -7,9 +7,9 @@ use crate::vector::quantizer::QuantizerStorage; use arrow::compute::concat_batches; use arrow_array::{ArrayRef, RecordBatch}; use arrow_schema::SchemaRef; -use deepsize::DeepSizeOf; use futures::prelude::stream::TryStreamExt; use lance_arrow::RecordBatchExt; +use lance_core::deepsize::DeepSizeOf; use lance_core::{Error, ROW_ID, Result}; use lance_encoding::decoder::FilterExpression; use lance_file::reader::FileReader; @@ -102,7 +102,7 @@ impl Default for QueryScratch { } impl DeepSizeOf for QueryScratch { - fn deep_size_of_children(&self, _context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, _context: &mut lance_core::deepsize::Context) -> usize { self.distances.capacity() * size_of::() + self.query_f32.capacity() * size_of::() + self.u16.capacity() * size_of::() @@ -224,7 +224,7 @@ impl Drop for QueryScratchGuard<'_> { } impl DeepSizeOf for QueryScratchPool { - fn deep_size_of_children(&self, context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, context: &mut lance_core::deepsize::Context) -> usize { let mut total = self.scratches.capacity() * size_of::(); let mut scratches = Vec::new(); while let Some(scratch) = self.scratches.pop() { @@ -381,7 +381,7 @@ pub struct IvfQuantizationStorage { } impl DeepSizeOf for IvfQuantizationStorage { - fn deep_size_of_children(&self, context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, context: &mut lance_core::deepsize::Context) -> usize { self.metadata.deep_size_of_children(context) + self.ivf.deep_size_of_children(context) } } @@ -531,7 +531,7 @@ impl IvfQuantizationStorage { #[cfg(test)] mod tests { use super::{QueryScratchCapacity, QueryScratchPool}; - use deepsize::DeepSizeOf; + use lance_core::deepsize::DeepSizeOf; #[test] fn test_query_scratch_pool_reuses_buffers() { diff --git a/rust/lance-index/src/vector/v3/subindex.rs b/rust/lance-index/src/vector/v3/subindex.rs index 7c9667859f4..9a49bc95f1d 100644 --- a/rust/lance-index/src/vector/v3/subindex.rs +++ b/rust/lance-index/src/vector/v3/subindex.rs @@ -6,7 +6,7 @@ use std::fmt::Debug; use std::sync::Arc; use arrow_array::{ArrayRef, RecordBatch}; -use deepsize::DeepSizeOf; +use lance_core::deepsize::DeepSizeOf; use lance_core::{Error, Result}; use crate::metrics::MetricsCollector; diff --git a/rust/lance-io/Cargo.toml b/rust/lance-io/Cargo.toml index d94b811cec0..6cee04d5fd9 100644 --- a/rust/lance-io/Cargo.toml +++ b/rust/lance-io/Cargo.toml @@ -34,7 +34,6 @@ aws-credential-types = { workspace = true, optional = true } byteorder.workspace = true bytes.workspace = true chrono.workspace = true -deepsize.workspace = true futures.workspace = true http.workspace = true log.workspace = true diff --git a/rust/lance-io/src/local.rs b/rust/lance-io/src/local.rs index 12f846bcc52..2b8a339331a 100644 --- a/rust/lance-io/src/local.rs +++ b/rust/lance-io/src/local.rs @@ -16,8 +16,8 @@ use std::os::windows::fs::FileExt; use async_trait::async_trait; use bytes::{Bytes, BytesMut}; -use deepsize::DeepSizeOf; use futures::future::BoxFuture; +use lance_core::deepsize::DeepSizeOf; use lance_core::{Error, Result}; use object_store::path::Path; use tokio::io::AsyncSeekExt; @@ -89,7 +89,7 @@ pub struct LocalObjectReader { } impl DeepSizeOf for LocalObjectReader { - fn deep_size_of_children(&self, context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, context: &mut lance_core::deepsize::Context) -> usize { // Skipping `file` as it should just be a file handle self.path.as_ref().deep_size_of_children(context) } diff --git a/rust/lance-io/src/object_reader.rs b/rust/lance-io/src/object_reader.rs index d6d5de98f0b..1c27800c90f 100644 --- a/rust/lance-io/src/object_reader.rs +++ b/rust/lance-io/src/object_reader.rs @@ -11,12 +11,12 @@ use crate::local::read_exact_at; use std::os::unix::fs::FileExt; use bytes::Bytes; -use deepsize::DeepSizeOf; use futures::{ FutureExt, future::{BoxFuture, Shared}, stream::{self, StreamExt}, }; +use lance_core::deepsize::DeepSizeOf; use lance_core::{Error, Result, error::CloneableError}; use object_store::ObjectStoreExt; use object_store::{GetOptions, GetResult, ObjectStore, Result as OSResult, path::Path}; @@ -74,7 +74,7 @@ pub struct CloudObjectReader { } impl DeepSizeOf for CloudObjectReader { - fn deep_size_of_children(&self, context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, context: &mut lance_core::deepsize::Context) -> usize { // Skipping object_store because there is no easy way to do that and it shouldn't be too big self.path.as_ref().deep_size_of_children(context) } @@ -449,7 +449,7 @@ pub(crate) fn stream_local_range( } impl DeepSizeOf for SmallReader { - fn deep_size_of_children(&self, context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, context: &mut lance_core::deepsize::Context) -> usize { let mut size = self.inner.path.as_ref().deep_size_of_children(context); if let Ok(guard) = self.inner.state.try_lock() diff --git a/rust/lance-io/src/object_store.rs b/rust/lance-io/src/object_store.rs index 1a4fd18c01e..0c44095f117 100644 --- a/rust/lance-io/src/object_store.rs +++ b/rust/lance-io/src/object_store.rs @@ -13,9 +13,9 @@ use std::time::Duration; use async_trait::async_trait; use bytes::Bytes; use chrono::{DateTime, Utc}; -use deepsize::DeepSizeOf; use futures::{FutureExt, Stream}; use futures::{StreamExt, TryStreamExt, future, stream::BoxStream}; +use lance_core::deepsize::DeepSizeOf; use lance_core::error::LanceOptionExt; use lance_core::utils::parse::str_is_truthy; use list_retry::ListRetryStream; @@ -153,7 +153,7 @@ pub struct ObjectStore { } impl DeepSizeOf for ObjectStore { - fn deep_size_of_children(&self, context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, context: &mut lance_core::deepsize::Context) -> usize { // We aren't counting `inner` here which is problematic but an ObjectStore // shouldn't be too big. The only exception might be the write cache but, if // the writer cache has data, it means we're using it somewhere else that isn't diff --git a/rust/lance-io/src/scheduler.rs b/rust/lance-io/src/scheduler.rs index 5993b161497..4f43cb00668 100644 --- a/rust/lance-io/src/scheduler.rs +++ b/rust/lance-io/src/scheduler.rs @@ -1411,8 +1411,8 @@ mod tests { path: Path, } - impl deepsize::DeepSizeOf for TrackingReader { - fn deep_size_of_children(&self, _context: &mut deepsize::Context) -> usize { + impl lance_core::deepsize::DeepSizeOf for TrackingReader { + fn deep_size_of_children(&self, _context: &mut lance_core::deepsize::Context) -> usize { 0 } } diff --git a/rust/lance-io/src/traits.rs b/rust/lance-io/src/traits.rs index eb83cf1a2ba..6a40171b6e0 100644 --- a/rust/lance-io/src/traits.rs +++ b/rust/lance-io/src/traits.rs @@ -5,8 +5,8 @@ use std::ops::Range; use async_trait::async_trait; use bytes::Bytes; -use deepsize::DeepSizeOf; use futures::{StreamExt, future::BoxFuture, stream::BoxStream}; +use lance_core::deepsize::DeepSizeOf; use object_store::path::Path; use prost::Message; use tokio::io::{AsyncWrite, AsyncWriteExt}; diff --git a/rust/lance-io/src/uring/current_thread.rs b/rust/lance-io/src/uring/current_thread.rs index bc09af058e6..abac772218b 100644 --- a/rust/lance-io/src/uring/current_thread.rs +++ b/rust/lance-io/src/uring/current_thread.rs @@ -14,10 +14,10 @@ use crate::traits::Reader; use crate::uring::DEFAULT_URING_QUEUE_DEPTH; use crate::utils::tracking_store::IOTracker; use bytes::{Bytes, BytesMut}; -use deepsize::DeepSizeOf; use futures::future::BoxFuture; use futures::{FutureExt, TryFutureExt}; use io_uring::{IoUring, opcode, types}; +use lance_core::deepsize::DeepSizeOf; use lance_core::{Error, Result}; use object_store::path::Path; @@ -258,7 +258,7 @@ pub struct UringCurrentThreadReader { } impl DeepSizeOf for UringCurrentThreadReader { - fn deep_size_of_children(&self, context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, context: &mut lance_core::deepsize::Context) -> usize { // Skip file handle (just a system resource) // Only count the path's deep size self.handle.path.as_ref().deep_size_of_children(context) diff --git a/rust/lance-io/src/uring/reader.rs b/rust/lance-io/src/uring/reader.rs index 0e7b0101ba8..a948e6c63dc 100644 --- a/rust/lance-io/src/uring/reader.rs +++ b/rust/lance-io/src/uring/reader.rs @@ -12,9 +12,9 @@ use crate::traits::Reader; use crate::uring::requests::RequestState; use crate::utils::tracking_store::IOTracker; use bytes::{Bytes, BytesMut}; -use deepsize::DeepSizeOf; use futures::future::BoxFuture; use futures::{FutureExt, TryFutureExt}; +use lance_core::deepsize::DeepSizeOf; use lance_core::{Error, Result}; use object_store::path::Path; use std::fs::File; @@ -109,7 +109,7 @@ pub struct UringReader { } impl DeepSizeOf for UringReader { - fn deep_size_of_children(&self, context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, context: &mut lance_core::deepsize::Context) -> usize { // Skip file handle (just a system resource) // Only count the path's deep size self.handle.path.as_ref().deep_size_of_children(context) diff --git a/rust/lance-io/src/utils.rs b/rust/lance-io/src/utils.rs index 48d28526a5e..b36dff75133 100644 --- a/rust/lance-io/src/utils.rs +++ b/rust/lance-io/src/utils.rs @@ -10,8 +10,8 @@ use arrow_array::{ use arrow_schema::DataType; use byteorder::{ByteOrder, LittleEndian}; use bytes::Bytes; -use deepsize::DeepSizeOf; use lance_arrow::*; +use lance_core::deepsize::DeepSizeOf; use prost::Message; use serde::{Deserialize, Serialize}; diff --git a/rust/lance-linalg/Cargo.toml b/rust/lance-linalg/Cargo.toml index 19d7ad4da47..cf91deb69d7 100644 --- a/rust/lance-linalg/Cargo.toml +++ b/rust/lance-linalg/Cargo.toml @@ -13,7 +13,6 @@ categories = { workspace = true } arrow-array = { workspace = true } arrow-buffer = { workspace = true } arrow-schema = { workspace = true } -deepsize = { workspace = true } half = { workspace = true } lance-arrow = { workspace = true } lance-core = { workspace = true } diff --git a/rust/lance-linalg/src/distance.rs b/rust/lance-linalg/src/distance.rs index 9dd57edb3f9..a356d5c1225 100644 --- a/rust/lance-linalg/src/distance.rs +++ b/rust/lance-linalg/src/distance.rs @@ -26,10 +26,10 @@ pub mod l2_u8; pub mod norm_l2; pub use cosine::*; -use deepsize::DeepSizeOf; pub use dot::*; use hamming::hamming_distance_arrow_batch; pub use l2::*; +use lance_core::deepsize::DeepSizeOf; pub use norm_l2::*; use crate::Result; diff --git a/rust/lance-linalg/src/distance/l2.rs b/rust/lance-linalg/src/distance/l2.rs index c830d103df4..c47aedd749f 100644 --- a/rust/lance-linalg/src/distance/l2.rs +++ b/rust/lance-linalg/src/distance/l2.rs @@ -15,10 +15,10 @@ use arrow_array::{ types::{Float16Type, Float32Type, Float64Type, Int8Type}, }; use arrow_schema::DataType; -use deepsize::DeepSizeOf; use half::{bf16, f16}; use lance_arrow::{ArrowFloatType, FixedSizeListArrayExt, FloatArray}; use lance_core::assume_eq; +use lance_core::deepsize::DeepSizeOf; use lance_core::utils::cpu::SIMD_SUPPORT; #[cfg(feature = "fp16kernels")] use lance_core::utils::cpu::SimdSupport; diff --git a/rust/lance-select/Cargo.toml b/rust/lance-select/Cargo.toml index 4d72b55b5e2..4cba7f082a8 100644 --- a/rust/lance-select/Cargo.toml +++ b/rust/lance-select/Cargo.toml @@ -18,7 +18,6 @@ arrow-schema = { workspace = true } byteorder = { workspace = true } tracing = { workspace = true } bytes = { workspace = true } -deepsize = { workspace = true } itertools = { workspace = true } lance-core = { workspace = true } roaring = { workspace = true } diff --git a/rust/lance-select/src/mask.rs b/rust/lance-select/src/mask.rs index a10ad9a6f50..b76e0de9a2b 100644 --- a/rust/lance-select/src/mask.rs +++ b/rust/lance-select/src/mask.rs @@ -9,8 +9,8 @@ use std::{collections::BTreeMap, io::Read}; use arrow_array::{Array, BinaryArray, GenericBinaryArray}; use arrow_buffer::{Buffer, NullBuffer, OffsetBuffer}; use byteorder::{ReadBytesExt, WriteBytesExt}; -use deepsize::DeepSizeOf; use itertools::Itertools; +use lance_core::deepsize::DeepSizeOf; use roaring::{MultiOps, RoaringBitmap, RoaringTreemap}; use lance_core::cache::CacheCodecImpl; @@ -308,7 +308,7 @@ pub enum RowAddrSelection { } impl DeepSizeOf for RowAddrSelection { - fn deep_size_of_children(&self, _context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, _context: &mut lance_core::deepsize::Context) -> usize { match self { Self::Full => 0, Self::Partial(bitmap) => bitmap.serialized_size(), @@ -1866,7 +1866,7 @@ mod tests { #[test] fn test_row_addr_selection_deep_size_of() { - use deepsize::DeepSizeOf; + use lance_core::deepsize::DeepSizeOf; // Test Full variant - should have minimal size (just the enum discriminant) let full = RowAddrSelection::Full; diff --git a/rust/lance-select/src/mask/nullable.rs b/rust/lance-select/src/mask/nullable.rs index f76838170f3..2f2cecac8eb 100644 --- a/rust/lance-select/src/mask/nullable.rs +++ b/rust/lance-select/src/mask/nullable.rs @@ -1,7 +1,7 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright The Lance Authors -use deepsize::DeepSizeOf; +use lance_core::deepsize::DeepSizeOf; use super::{RowAddrMask, RowAddrTreeMap, RowSetOps}; diff --git a/rust/lance-table/Cargo.toml b/rust/lance-table/Cargo.toml index 836b54e942f..042ae92c618 100644 --- a/rust/lance-table/Cargo.toml +++ b/rust/lance-table/Cargo.toml @@ -28,7 +28,6 @@ aws-sdk-dynamodb = { workspace = true, optional = true, default-features = false byteorder.workspace = true bytes.workspace = true chrono.workspace = true -deepsize.workspace = true futures.workspace = true log.workspace = true object_store.workspace = true diff --git a/rust/lance-table/benches/manifest_intern.rs b/rust/lance-table/benches/manifest_intern.rs index aa798a0bfc2..78b7e352207 100644 --- a/rust/lance-table/benches/manifest_intern.rs +++ b/rust/lance-table/benches/manifest_intern.rs @@ -11,7 +11,7 @@ //! `RowDatasetVersionMeta::Inline` bytes across many fragments. use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main}; -use deepsize::DeepSizeOf; +use lance_core::deepsize::DeepSizeOf; use prost::Message; use lance_table::format::pb; diff --git a/rust/lance-table/src/format/fragment.rs b/rust/lance-table/src/format/fragment.rs index dc5c94b388a..431e466dbd4 100644 --- a/rust/lance-table/src/format/fragment.rs +++ b/rust/lance-table/src/format/fragment.rs @@ -5,8 +5,8 @@ use std::collections::HashMap; use std::num::NonZero; use std::sync::Arc; -use deepsize::DeepSizeOf; use lance_core::Error; +use lance_core::deepsize::DeepSizeOf; use lance_file::format::{MAJOR_VERSION, MINOR_VERSION}; use lance_file::version::LanceFileVersion; use lance_io::utils::CachedFileSize; diff --git a/rust/lance-table/src/format/index.rs b/rust/lance-table/src/format/index.rs index 945d8364123..33ee464fe76 100644 --- a/rust/lance-table/src/format/index.rs +++ b/rust/lance-table/src/format/index.rs @@ -7,8 +7,8 @@ use std::collections::HashMap; use std::sync::Arc; use chrono::{DateTime, Utc}; -use deepsize::DeepSizeOf; use futures::StreamExt; +use lance_core::deepsize::DeepSizeOf; use lance_io::object_store::ObjectStore; use object_store::path::Path; use roaring::RoaringBitmap; @@ -121,7 +121,7 @@ impl IndexMetadata { } impl DeepSizeOf for IndexMetadata { - fn deep_size_of_children(&self, context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, context: &mut lance_core::deepsize::Context) -> usize { self.uuid.as_bytes().deep_size_of_children(context) + self.fields.deep_size_of_children(context) + self.name.deep_size_of_children(context) diff --git a/rust/lance-table/src/format/manifest.rs b/rust/lance-table/src/format/manifest.rs index d2b5f2d31c6..9845061b7e4 100644 --- a/rust/lance-table/src/format/manifest.rs +++ b/rust/lance-table/src/format/manifest.rs @@ -3,7 +3,7 @@ use async_trait::async_trait; use chrono::prelude::*; -use deepsize::DeepSizeOf; +use lance_core::deepsize::DeepSizeOf; use lance_file::datatypes::{Fields, FieldsWithMeta, populate_schema_dictionary}; use lance_file::previous::reader::FileReader as PreviousFileReader; use lance_file::version::{LEGACY_FORMAT_VERSION, LanceFileVersion}; @@ -588,7 +588,7 @@ impl BasePath { } impl DeepSizeOf for BasePath { - fn deep_size_of_children(&self, context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, context: &mut lance_core::deepsize::Context) -> usize { self.name.deep_size_of_children(context) + self.path.deep_size_of_children(context) * 2 + size_of::() diff --git a/rust/lance-table/src/rowids.rs b/rust/lance-table/src/rowids.rs index 0b56be84f56..6975d798143 100644 --- a/rust/lance-table/src/rowids.rs +++ b/rust/lance-table/src/rowids.rs @@ -22,7 +22,7 @@ pub mod segment; mod serde; pub mod version; -use deepsize::DeepSizeOf; +use lance_core::deepsize::DeepSizeOf; // These are the public API. pub use index::FragmentRowIdIndex; pub use index::RowIdIndex; diff --git a/rust/lance-table/src/rowids/bitmap.rs b/rust/lance-table/src/rowids/bitmap.rs index 9001c04c3a0..ce7eadd5634 100644 --- a/rust/lance-table/src/rowids/bitmap.rs +++ b/rust/lance-table/src/rowids/bitmap.rs @@ -1,7 +1,7 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright The Lance Authors -use deepsize::DeepSizeOf; +use lance_core::deepsize::DeepSizeOf; #[derive(PartialEq, Eq, Clone, DeepSizeOf)] pub struct Bitmap { diff --git a/rust/lance-table/src/rowids/encoded_array.rs b/rust/lance-table/src/rowids/encoded_array.rs index 06614765af1..7564cb6bb21 100644 --- a/rust/lance-table/src/rowids/encoded_array.rs +++ b/rust/lance-table/src/rowids/encoded_array.rs @@ -3,7 +3,7 @@ use std::ops::Range; -use deepsize::DeepSizeOf; +use lance_core::deepsize::DeepSizeOf; /// Encoded array of u64 values. /// diff --git a/rust/lance-table/src/rowids/index.rs b/rust/lance-table/src/rowids/index.rs index 718f2b8f2a9..66720ed1f25 100644 --- a/rust/lance-table/src/rowids/index.rs +++ b/rust/lance-table/src/rowids/index.rs @@ -5,8 +5,8 @@ use std::ops::RangeInclusive; use std::sync::Arc; use super::{RowIdSequence, U64Segment}; -use deepsize::DeepSizeOf; use lance_core::Result; +use lance_core::deepsize::DeepSizeOf; use lance_core::utils::address::RowAddress; use lance_core::utils::deletion::DeletionVector; use rangemap::RangeInclusiveMap; @@ -120,7 +120,7 @@ impl RowIdIndex { } impl DeepSizeOf for RowIdIndex { - fn deep_size_of_children(&self, context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, context: &mut lance_core::deepsize::Context) -> usize { self.0 .iter() .map(|(_, (row_id_segment, address_segment))| { diff --git a/rust/lance-table/src/rowids/segment.rs b/rust/lance-table/src/rowids/segment.rs index a02acd8a573..6fba8599016 100644 --- a/rust/lance-table/src/rowids/segment.rs +++ b/rust/lance-table/src/rowids/segment.rs @@ -4,7 +4,7 @@ use std::ops::{Range, RangeInclusive}; use super::{bitmap::Bitmap, encoded_array::EncodedU64Array}; -use deepsize::DeepSizeOf; +use lance_core::deepsize::DeepSizeOf; /// Convert an estimated serialized byte cost from `u128` to `usize`, saturating /// at [`usize::MAX`] when the value does not fit (infeasible encodings). @@ -70,7 +70,7 @@ pub enum U64Segment { } impl DeepSizeOf for U64Segment { - fn deep_size_of_children(&self, context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, context: &mut lance_core::deepsize::Context) -> usize { match self { Self::Range(_) => 0, Self::RangeWithHoles { holes, .. } => holes.deep_size_of_children(context), diff --git a/rust/lance-table/src/rowids/version.rs b/rust/lance-table/src/rowids/version.rs index 80f3d06db60..6ddb083c36e 100644 --- a/rust/lance-table/src/rowids/version.rs +++ b/rust/lance-table/src/rowids/version.rs @@ -9,9 +9,9 @@ use std::sync::Arc; -use deepsize::DeepSizeOf; use lance_core::Error; use lance_core::Result; +use lance_core::deepsize::DeepSizeOf; use prost::Message; use serde::de::Deserializer; use serde::ser::Serializer; diff --git a/rust/lance-table/src/system_index/frag_reuse.rs b/rust/lance-table/src/system_index/frag_reuse.rs index 141f35688d4..40bbc4f58b6 100644 --- a/rust/lance-table/src/system_index/frag_reuse.rs +++ b/rust/lance-table/src/system_index/frag_reuse.rs @@ -6,7 +6,7 @@ use std::{collections::HashMap, sync::Arc}; use arrow_array::cast::AsArray; use arrow_array::types::UInt64Type; use arrow_array::{Array, ArrayRef, PrimitiveArray, RecordBatch, UInt64Array}; -use deepsize::{Context, DeepSizeOf}; +use lance_core::deepsize::{Context, DeepSizeOf}; use lance_core::{Error, Result}; use lance_select::RowAddrTreeMap; use roaring::{RoaringBitmap, RoaringTreemap}; diff --git a/rust/lance-table/src/system_index/mem_wal.rs b/rust/lance-table/src/system_index/mem_wal.rs index 9b42cf90e6c..3bf279df062 100644 --- a/rust/lance-table/src/system_index/mem_wal.rs +++ b/rust/lance-table/src/system_index/mem_wal.rs @@ -3,8 +3,8 @@ use std::collections::HashMap; -use deepsize::DeepSizeOf; use lance_core::Error; +use lance_core::deepsize::DeepSizeOf; use serde::{Deserialize, Serialize}; use uuid::Uuid; @@ -48,7 +48,7 @@ pub struct MergedGeneration { } impl DeepSizeOf for MergedGeneration { - fn deep_size_of_children(&self, _context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, _context: &mut lance_core::deepsize::Context) -> usize { 0 // UUID is 16 bytes fixed size, no heap allocations } } @@ -169,7 +169,7 @@ pub struct ShardManifest { } impl DeepSizeOf for ShardManifest { - fn deep_size_of_children(&self, context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, context: &mut lance_core::deepsize::Context) -> usize { self.shard_field_values.deep_size_of_children(context) + self.flushed_generations.deep_size_of_children(context) } diff --git a/rust/lance/Cargo.toml b/rust/lance/Cargo.toml index e1d9182b24d..74e6faf5c07 100644 --- a/rust/lance/Cargo.toml +++ b/rust/lance/Cargo.toml @@ -56,7 +56,6 @@ crossbeam-queue = { workspace = true } crossbeam-skiplist.workspace = true # This is already used by datafusion dashmap = "6" -deepsize.workspace = true # matches arrow-rs use half.workspace = true # Fast non-cryptographic hasher for the hot FTS mem-index insert path. diff --git a/rust/lance/src/dataset.rs b/rust/lance/src/dataset.rs index 2e448dfa828..a5d47d0a5bc 100644 --- a/rust/lance/src/dataset.rs +++ b/rust/lance/src/dataset.rs @@ -8,10 +8,10 @@ use arrow_array::{RecordBatch, RecordBatchReader}; use arrow_schema::DataType; use byteorder::{ByteOrder, LittleEndian}; use chrono::{Duration, prelude::*}; -use deepsize::DeepSizeOf; use futures::future::BoxFuture; use futures::stream::{self, BoxStream, StreamExt, TryStreamExt}; use futures::{FutureExt, Stream}; +use lance_core::deepsize::DeepSizeOf; use crate::dataset::metadata::UpdateFieldMetadataBuilder; use crate::dataset::transaction::translate_schema_metadata_updates; diff --git a/rust/lance/src/dataset/transaction.rs b/rust/lance/src/dataset/transaction.rs index 3f96b9964d5..4555cd7ee6c 100644 --- a/rust/lance/src/dataset/transaction.rs +++ b/rust/lance/src/dataset/transaction.rs @@ -17,11 +17,11 @@ use super::write::merge_insert::inserted_rows::KeyExistenceFilter; use crate::dataset::transaction::UpdateMode::{RewriteColumns, RewriteRows}; use crate::index::mem_wal::update_mem_wal_index_merged_generations; use crate::utils::temporal::timestamp_to_nanos; -use deepsize::DeepSizeOf; use lance_core::datatypes::{ LANCE_UNENFORCED_CLUSTERING_KEY_POSITION, LANCE_UNENFORCED_PRIMARY_KEY, LANCE_UNENFORCED_PRIMARY_KEY_POSITION, }; +use lance_core::deepsize::DeepSizeOf; use lance_core::{Error, Result, datatypes::Schema}; use lance_file::{datatypes::Fields, version::LanceFileVersion}; use lance_index::mem_wal::MergedGeneration; @@ -476,7 +476,7 @@ pub enum UpdateMode { pub struct UpdatedFragmentOffsets(pub HashMap); impl DeepSizeOf for UpdatedFragmentOffsets { - fn deep_size_of_children(&self, context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, context: &mut lance_core::deepsize::Context) -> usize { self.0.iter().fold(0_usize, |acc, (frag_id, bitmap)| { acc + frag_id.deep_size_of_children(context) + (bitmap.len() as usize).saturating_mul(std::mem::size_of::()) @@ -1361,7 +1361,7 @@ pub struct RewrittenIndex { } impl DeepSizeOf for RewrittenIndex { - fn deep_size_of_children(&self, context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, context: &mut lance_core::deepsize::Context) -> usize { self.new_index_details .type_url .deep_size_of_children(context) diff --git a/rust/lance/src/dataset/write/merge_insert/inserted_rows.rs b/rust/lance/src/dataset/write/merge_insert/inserted_rows.rs index a0a7f93b653..805073e75e2 100644 --- a/rust/lance/src/dataset/write/merge_insert/inserted_rows.rs +++ b/rust/lance/src/dataset/write/merge_insert/inserted_rows.rs @@ -13,8 +13,8 @@ use arrow_array::{ StringArray, StructArray, }; use arrow_schema::DataType; -use deepsize::DeepSizeOf; use lance_core::Result; +use lance_core::deepsize::DeepSizeOf; use lance_core::utils::bloomfilter::sbbf::{Sbbf, SbbfBuilder}; use lance_table::format::pb; diff --git a/rust/lance/src/index.rs b/rust/lance/src/index.rs index c628dbe5919..6dbaebed731 100644 --- a/rust/lance/src/index.rs +++ b/rust/lance/src/index.rs @@ -340,7 +340,7 @@ impl CacheKey for IvfIndexStateCacheKey<'_> { /// Wrapper that stores a live VectorIndex in the cache. /// Used for v0.1/v0.2 indices that don't support serializable caching. -#[derive(Debug, deepsize::DeepSizeOf)] +#[derive(Debug, lance_core::deepsize::DeepSizeOf)] pub(crate) struct CachedLegacyVectorIndex(Arc); #[derive(Debug, Clone)] diff --git a/rust/lance/src/index/scalar_logical.rs b/rust/lance/src/index/scalar_logical.rs index d2ab4e4b9f0..294f9029289 100644 --- a/rust/lance/src/index/scalar_logical.rs +++ b/rust/lance/src/index/scalar_logical.rs @@ -7,8 +7,8 @@ use std::any::Any; use std::sync::Arc; use async_trait::async_trait; -use deepsize::{Context, DeepSizeOf}; use futures::future::try_join_all; +use lance_core::deepsize::{Context, DeepSizeOf}; use lance_core::{Error, Result}; use lance_index::metrics::MetricsCollector; use lance_index::scalar::{AnyQuery, CreatedIndex, ScalarIndex, SearchResult, UpdateCriteria}; diff --git a/rust/lance/src/index/vector/fixture_test.rs b/rust/lance/src/index/vector/fixture_test.rs index 6d577c73ff8..906c6514ded 100644 --- a/rust/lance/src/index/vector/fixture_test.rs +++ b/rust/lance/src/index/vector/fixture_test.rs @@ -18,8 +18,8 @@ mod test { use arrow_schema::{DataType, Field, Schema}; use async_trait::async_trait; use datafusion::execution::SendableRecordBatchStream; - use deepsize::{Context, DeepSizeOf}; use lance_arrow::FixedSizeListArrayExt; + use lance_core::deepsize::{Context, DeepSizeOf}; use lance_core::{cache::LanceCache, utils::tempfile::TempStdFile}; use lance_index::vector::v3::subindex::SubIndexType; use lance_index::{Index, IndexType, vector::Query}; @@ -54,7 +54,8 @@ mod test { impl DeepSizeOf for ResidualCheckMockIndex { fn deep_size_of_children(&self, cx: &mut Context) -> usize { - self.assert_query_value.deep_size_of_children(cx) + self.ret_val.get_array_memory_size() + self.assert_query_value.deep_size_of_children(cx) + + self.ret_val.deep_size_of_children(cx) } } diff --git a/rust/lance/src/index/vector/ivf.rs b/rust/lance/src/index/vector/ivf.rs index aa820da3fa3..9b619da2eac 100644 --- a/rust/lance/src/index/vector/ivf.rs +++ b/rust/lance/src/index/vector/ivf.rs @@ -35,7 +35,6 @@ use arrow_buffer::MutableBuffer; use arrow_schema::{DataType, Schema}; use async_trait::async_trait; use datafusion::execution::SendableRecordBatchStream; -use deepsize::DeepSizeOf; use futures::TryFutureExt; use futures::{ Stream, TryStreamExt, @@ -43,6 +42,7 @@ use futures::{ }; use io::write_hnsw_quantization_index_partitions; use lance_arrow::*; +use lance_core::deepsize::DeepSizeOf; use lance_core::{ Error, ROW_ID_FIELD, Result, cache::{LanceCache, UnsizedCacheKey, WeakLanceCache}, @@ -169,7 +169,7 @@ pub struct IVFIndex { } impl DeepSizeOf for IVFIndex { - fn deep_size_of_children(&self, context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, context: &mut lance_core::deepsize::Context) -> usize { self.uuid.deep_size_of_children(context) + self.reader.deep_size_of_children(context) + self.sub_index.deep_size_of_children(context) diff --git a/rust/lance/src/index/vector/ivf/v2.rs b/rust/lance/src/index/vector/ivf/v2.rs index 510b0cc45a6..36be2398900 100644 --- a/rust/lance/src/index/vector/ivf/v2.rs +++ b/rust/lance/src/index/vector/ivf/v2.rs @@ -21,13 +21,13 @@ use async_trait::async_trait; use datafusion::error::{DataFusionError, Result as DataFusionResult}; use datafusion::execution::SendableRecordBatchStream; use datafusion::physical_plan::stream::RecordBatchStreamAdapter; -use deepsize::DeepSizeOf; use futures::future::BoxFuture; use futures::prelude::stream::{self, TryStreamExt}; use futures::{StreamExt, TryFutureExt}; use lance_arrow::RecordBatchExt; use lance_arrow::ipc::write_len_prefixed_bytes; use lance_core::cache::{CacheCodec, CacheCodecImpl, CacheKey, LanceCache, WeakLanceCache}; +use lance_core::deepsize::DeepSizeOf; use lance_core::utils::tokio::{get_num_compute_intensive_cpus, spawn_cpu}; use lance_core::utils::tracing::{IO_TYPE_LOAD_VECTOR_PART, TRACE_IO_EVENTS}; use lance_core::{Error, ROW_ID, Result}; @@ -119,7 +119,7 @@ struct PreparedPartitionSearch { } impl DeepSizeOf for IvfIndexState { - fn deep_size_of_children(&self, context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, context: &mut lance_core::deepsize::Context) -> usize { self.index_file_path.deep_size_of_children(context) + self.uuid.deep_size_of_children(context) + self.ivf.deep_size_of_children(context) @@ -178,7 +178,7 @@ pub(crate) trait IvfStateEntry: DeepSizeOf + Send + Sync + 'static { pub(crate) struct IvfStateEntryBox(pub(crate) Arc); impl DeepSizeOf for IvfStateEntryBox { - fn deep_size_of_children(&self, context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, context: &mut lance_core::deepsize::Context) -> usize { self.0.deep_size_of_children(context) } } @@ -394,8 +394,8 @@ struct CachedIndexReaders { aux_reader: Arc, } -impl deepsize::DeepSizeOf for CachedIndexReaders { - fn deep_size_of_children(&self, context: &mut deepsize::Context) -> usize { +impl lance_core::deepsize::DeepSizeOf for CachedIndexReaders { + fn deep_size_of_children(&self, context: &mut lance_core::deepsize::Context) -> usize { // FileReader doesn't impl DeepSizeOf. We approximate by counting the // fixed struct size for each reader plus the Arc // heap contents. The metadata Arcs are also held by FileMetadataCacheKey @@ -537,7 +537,7 @@ pub struct IVFIndex { } impl DeepSizeOf for IVFIndex { - fn deep_size_of_children(&self, context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, context: &mut lance_core::deepsize::Context) -> usize { self.uri.deep_size_of_children(context) + self.index_path.deep_size_of_children(context) + self.ivf.deep_size_of_children(context) diff --git a/rust/lance/src/index/vector/pq.rs b/rust/lance/src/index/vector/pq.rs index 1f1a89d80c8..1c69cfcd223 100644 --- a/rust/lance/src/index/vector/pq.rs +++ b/rust/lance/src/index/vector/pq.rs @@ -17,8 +17,8 @@ use arrow_select::take::take; use async_trait::async_trait; use datafusion::execution::SendableRecordBatchStream; use datafusion::physical_plan::stream::RecordBatchStreamAdapter; -use deepsize::DeepSizeOf; use lance_arrow::FixedSizeListArrayExt; +use lance_core::deepsize::DeepSizeOf; use lance_core::utils::address::RowAddress; use lance_core::utils::tokio::spawn_cpu; use lance_core::{ROW_ID, ROW_ID_FIELD}; @@ -71,17 +71,29 @@ pub struct PQIndex { } impl DeepSizeOf for PQIndex { - fn deep_size_of_children(&self, context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, context: &mut lance_core::deepsize::Context) -> usize { self.pq.deep_size_of_children(context) + self .code .as_ref() - .map(|code| code.get_array_memory_size()) + .map(|code| { + if context.mark_seen(Arc::as_ptr(code) as *const () as usize) { + (code.as_ref() as &dyn arrow_array::Array).deep_size_of_children(context) + } else { + 0 + } + }) .unwrap_or(0) + self .row_ids .as_ref() - .map(|row_ids| row_ids.get_array_memory_size()) + .map(|row_ids| { + if context.mark_seen(Arc::as_ptr(row_ids) as *const () as usize) { + (row_ids.as_ref() as &dyn arrow_array::Array).deep_size_of_children(context) + } else { + 0 + } + }) .unwrap_or(0) } } diff --git a/rust/lance/src/io/exec/knn.rs b/rust/lance/src/io/exec/knn.rs index 910110011f4..b8adcfc1dbe 100644 --- a/rust/lance/src/io/exec/knn.rs +++ b/rust/lance/src/io/exec/knn.rs @@ -1892,7 +1892,7 @@ mod tests { use async_trait::async_trait; use datafusion::error::Result as DataFusionResult; use datafusion::physical_plan::stream::RecordBatchStreamAdapter; - use deepsize::DeepSizeOf; + use lance_core::deepsize::DeepSizeOf; use lance_core::utils::tempfile::TempStrDir; use lance_datafusion::exec::{ExecutionStatsCallback, ExecutionSummaryCounts}; use lance_datafusion::utils::FIND_PARTITIONS_ELAPSED_METRIC; diff --git a/rust/lance/src/session.rs b/rust/lance/src/session.rs index b032cbaa15e..484d53c066a 100644 --- a/rust/lance/src/session.rs +++ b/rust/lance/src/session.rs @@ -4,8 +4,8 @@ use std::collections::HashMap; use std::sync::Arc; -use deepsize::DeepSizeOf; use lance_core::cache::{CacheBackend, LanceCache}; +use lance_core::deepsize::DeepSizeOf; use lance_core::{Error, Result}; use lance_index::IndexType; use lance_io::object_store::ObjectStoreRegistry; @@ -56,7 +56,7 @@ pub struct Session { } impl DeepSizeOf for Session { - fn deep_size_of_children(&self, context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, context: &mut lance_core::deepsize::Context) -> usize { let mut size = 0; // Measure the actual cache contents through the wrapper types size += self.index_cache.deep_size_of_children(context); diff --git a/rust/lance/src/session/caches.rs b/rust/lance/src/session/caches.rs index 82dc755f6c0..a2dda6069ab 100644 --- a/rust/lance/src/session/caches.rs +++ b/rust/lance/src/session/caches.rs @@ -12,7 +12,7 @@ use std::{borrow::Cow, ops::Deref}; -use deepsize::{Context, DeepSizeOf}; +use lance_core::deepsize::{Context, DeepSizeOf}; use lance_core::{ cache::{CacheKey, LanceCache}, utils::deletion::DeletionVector, diff --git a/rust/lance/src/session/index_caches.rs b/rust/lance/src/session/index_caches.rs index 3ae777880aa..e01261a1467 100644 --- a/rust/lance/src/session/index_caches.rs +++ b/rust/lance/src/session/index_caches.rs @@ -12,8 +12,8 @@ use std::{borrow::Cow, ops::Deref, sync::Arc}; -use deepsize::{Context, DeepSizeOf}; use lance_core::cache::{CacheKey, LanceCache}; +use lance_core::deepsize::{Context, DeepSizeOf}; use lance_index::frag_reuse::FragReuseIndex; use lance_table::format::IndexMetadata; use uuid::Uuid; diff --git a/rust/lance/src/session/index_extension.rs b/rust/lance/src/session/index_extension.rs index f5e7741441f..d4aeb6dae0d 100644 --- a/rust/lance/src/session/index_extension.rs +++ b/rust/lance/src/session/index_extension.rs @@ -3,8 +3,8 @@ use std::sync::Arc; -use deepsize::DeepSizeOf; use lance_core::Result; +use lance_core::deepsize::DeepSizeOf; use lance_file::previous::reader::FileReader as PreviousFileReader; use lance_index::{IndexParams, IndexType, vector::VectorIndex}; @@ -69,7 +69,7 @@ mod test { use arrow_array::{Float32Array, RecordBatch, UInt32Array}; use arrow_schema::Schema; use datafusion::execution::SendableRecordBatchStream; - use deepsize::DeepSizeOf; + use lance_core::deepsize::DeepSizeOf; use lance_file::previous::writer::{ FileWriter as PreviousFileWriter, FileWriterOptions as PreviousFileWriterOptions, }; @@ -95,7 +95,7 @@ mod test { struct MockIndex; impl DeepSizeOf for MockIndex { - fn deep_size_of_children(&self, _context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, _context: &mut lance_core::deepsize::Context) -> usize { 0 } } @@ -230,7 +230,7 @@ mod test { } impl DeepSizeOf for MockIndexExtension { - fn deep_size_of_children(&self, _context: &mut deepsize::Context) -> usize { + fn deep_size_of_children(&self, _context: &mut lance_core::deepsize::Context) -> usize { todo!() } }