From aba63239f0cc796507a4022b85c96c53c3054952 Mon Sep 17 00:00:00 2001 From: Mark Ko Date: Fri, 5 Jun 2026 00:39:52 +0000 Subject: [PATCH] fix - model-loader : add --reclaim-mmap-source to drop dormant mmap pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After a tensor is copied out of the mmap into a separate buffer (e.g. CPU weight repacking), the read-only file-backed source pages are dormant but still sit in the RSS. With --reclaim-mmap-source, madvise(MADV_DONTNEED) drops them; the mapping stays valid and re-faults from the file if ever touched. Linux only, flag-gated, skipped under --mlock. The range is rounded inward to whole pages. Addresses #16761. Default is OFF. Because this is a bug fix for issue #16761 (https://github.com/ggml-org/llama.cpp/issues/16761), I'm happy to flip the default to ON and/or gate it behind an environment variable instead, so the switch can be removed cleanly later. ```bash ./build/bin/llama-completion "${ARGS[@]}" >off.txt 2>/dev/null ./build/bin/llama-completion "${ARGS[@]}" --reclaim-mmap-source >on.txt 2>/dev/null cmp off.txt on.txt && echo "PASS: byte-identical output" ``` ```bash bash scripts/get-wikitext-2.sh # ~280 KB WIKI=wikitext-2-raw/wiki.test.raw ./build/bin/llama-perplexity -m "$MODEL" -f "$WIKI" --chunks 64 -t "$T" ./build/bin/llama-perplexity -m "$MODEL" -f "$WIKI" --chunks 64 -t "$T" --reclaim-mmap-source ``` ```bash /usr/bin/time -v ./build/bin/llama-completion "${ARGS[@]}" 2>&1 >/dev/null | grep -E "Maximum resident|Major .* page faults" /usr/bin/time -v ./build/bin/llama-completion "${ARGS[@]}" --reclaim-mmap-source 2>&1 >/dev/null | grep -E "Maximum resident|Major .* page faults" ``` Observed peak RSS (`-c 4096`), `Major page faults = 0` in every run: | model | quant / arch | OFF | ON | saving | CPU_REPACK buffer | |---|---|---:|---:|---:|---:| | Llama-3.2-3B | Q4_0 / dense | 3.73 GiB | 2.30 GiB | −1.43 GiB (38%) | 1.47 GiB | | DeepSeek-V2-Lite | Q4_K_M / MoE | 16.19 GiB | 10.75 GiB | −5.44 GiB (34%) | 5.58 
GiB | | Qwen3-30B-A3B | Q4_K_M / MoE | ~34 GiB | ~21 GiB | −13.0 GiB (37%) | 13.4 GiB | The saving tracks the CPU_REPACK buffer size (the duplicated source). `majflt=0` confirms the dropped pages are never re-faulted on the hot path. ```bash ./build/bin/llama-completion "${ARGS[@]}" --no-mmap --reclaim-mmap-source >nommap.txt 2>/dev/null ./build/bin/llama-completion "${ARGS[@]}" --mlock --reclaim-mmap-source >mlock.txt 2>/dev/null cmp off.txt nommap.txt && cmp off.txt mlock.txt && echo "PASS: inert under --no-mmap and --mlock" ``` - Behavior is unchanged by construction — the flag only `madvise(MADV_DONTNEED)`s the read-only file-backed source after it has been copied out; any access re-faults identical bytes from the file. - Linux-only (`#if defined(MADV_DONTNEED)`); compiles to a no-op elsewhere. - Detailed before/after VMA breakdown (`model_mmap` 17.6 → 4.3 GiB on Qwen3) via `/proc/<pid>/smaps`. --- common/arg.cpp | 8 ++++++++ common/common.cpp | 1 + common/common.h | 1 + include/llama.h | 1 + src/llama-model-loader.cpp | 32 ++++++++++++++++++++++++++++++++ src/llama-model-loader.h | 2 ++ src/llama-model.cpp | 1 + src/llama-quant.cpp | 2 +- src/llama.cpp | 2 +- 9 files changed, 48 insertions(+), 2 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 1ffaf704858..b7adba447d9 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -2268,6 +2268,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.use_mmap = value; } ).set_env("LLAMA_ARG_MMAP")); + add_opt(common_arg( + {"--reclaim-mmap-source"}, + "drop mmap'd source pages from RSS after they are copied out of the mmap into a separate buffer " + "(e.g. 
CPU weight repacking); saves memory for repack-eligible quants under mmap, no effect with --no-mmap or --mlock (Linux only)", + [](common_params & params) { + params.reclaim_mmap_source = true; + } + ).set_env("LLAMA_ARG_RECLAIM_MMAP_SOURCE")); add_opt(common_arg( {"-dio", "--direct-io"}, {"-ndio", "--no-direct-io"}, diff --git a/common/common.cpp b/common/common.cpp index b6a7626f2a1..35cd4269278 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1532,6 +1532,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) { mparams.check_tensors = params.check_tensors; mparams.use_extra_bufts = !params.no_extra_bufts; mparams.no_host = params.no_host; + mparams.reclaim_mmap_source = params.reclaim_mmap_source; if (params.kv_overrides.empty()) { mparams.kv_overrides = NULL; diff --git a/common/common.h b/common/common.h index 13f387271d8..3f68481e0f7 100644 --- a/common/common.h +++ b/common/common.h @@ -551,6 +551,7 @@ struct common_params { bool use_mmap = true; // enable mmap to use filesystem cache bool use_direct_io = false; // read from disk without buffering bool use_mlock = false; // use mlock to keep model in memory + bool reclaim_mmap_source = false; // drop mmap'd source pages after they are copied out of the mmap (Linux only) bool verbose_prompt = false; // print prompt tokens before generation bool display_prompt = true; // print prompt before generation bool no_kv_offload = false; // disable KV offloading diff --git a/include/llama.h b/include/llama.h index 9f78aa9a056..f0c1a0d9711 100644 --- a/include/llama.h +++ b/include/llama.h @@ -324,6 +324,7 @@ extern "C" { bool use_extra_bufts; // use extra buffer types (used for weight repacking) bool no_host; // bypass host buffer allowing extra buffers to be used bool no_alloc; // only load metadata and simulate memory allocations + bool reclaim_mmap_source; // drop mmap'd source pages after they are copied out of the mmap into a separate buffer (Linux only) }; struct 
llama_sampler_seq_config { diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index 4d7b11067c9..b30b26b6b86 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -5,6 +5,11 @@ #include "gguf.h" #include "llama-hparams.h" +#ifdef __linux__ +#include // madvise, MADV_DONTNEED +#include // sysconf, _SC_PAGESIZE +#endif + #include #include #include @@ -518,6 +523,7 @@ llama_model_loader::llama_model_loader( bool use_direct_io, bool check_tensors, bool no_alloc, + bool reclaim_mmap_source, const llama_model_kv_override * param_overrides_p, const llama_model_tensor_buft_override * param_tensor_buft_overrides_p) : metadata(meta), set_tensor_data(set_tensor_data), set_tensor_data_ud(set_tensor_data_ud) { @@ -816,6 +822,7 @@ llama_model_loader::llama_model_loader( this->use_direct_io = use_direct_io; this->check_tensors = check_tensors; this->no_alloc = no_alloc; + this->reclaim_mmap_source = reclaim_mmap_source; } std::string llama_model_loader::get_arch_name() const { @@ -1518,6 +1525,11 @@ bool llama_model_loader::load_all_data( ggml_backend_name(upload_backend)); } +#if defined(MADV_DONTNEED) + // Page size for reclaim_mmap_source; queried once and reused for every repacked tensor. + const uintptr_t reclaim_page_size = (uintptr_t) sysconf(_SC_PAGESIZE); +#endif + for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) { const auto * weight = get_weight(ggml_get_name(cur)); if (weight == nullptr) { @@ -1560,6 +1572,26 @@ bool llama_model_loader::load_all_data( mmap_used.second = std::max(mmap_used.second, weight->offs + n_size); } else { ggml_backend_tensor_set(cur, data, 0, n_size); + +#if defined(MADV_DONTNEED) + // This tensor's data was copied from the mmap into its own buffer (e.g. + // CPU_REPACK), so the source pages [data, data + n_size) are now dormant - + // compute reads only from the copy. 
madvise drops them from RSS; the + // read-only file mapping stays valid and any later access (e.g. a + // concurrent --check-tensors validation thread) simply re-faults the + // original bytes from disk - safe, never wrong data. Skipped under mlock + // (lmlocks != nullptr), where the user wants the file pages resident. The + // range is rounded inward to whole pages so we never evict a page shared + // with a neighbouring zero-copy tensor. Reclaims the duplication reported + // upstream in #16761. + if (reclaim_mmap_source && lmlocks == nullptr) { + const uintptr_t beg = ((uintptr_t) data + reclaim_page_size - 1) & ~(reclaim_page_size - 1); + const uintptr_t end = ((uintptr_t) data + n_size) & ~(reclaim_page_size - 1); + if (end > beg) { + madvise((void *) beg, end - beg, MADV_DONTNEED); + } + } +#endif } } else { const auto & file = files.at(weight->idx); diff --git a/src/llama-model-loader.h b/src/llama-model-loader.h index c476026d3e5..72147d70f9d 100644 --- a/src/llama-model-loader.h +++ b/src/llama-model-loader.h @@ -79,6 +79,7 @@ struct llama_model_loader { bool use_direct_io = false; bool check_tensors; bool no_alloc; + bool reclaim_mmap_source = false; llama_files files; llama_ftype ftype; @@ -130,6 +131,7 @@ struct llama_model_loader { bool use_direct_io, bool check_tensors, bool no_alloc, + bool reclaim_mmap_source, const llama_model_kv_override * param_overrides_p, const llama_model_tensor_buft_override * param_tensor_buft_overrides_p); diff --git a/src/llama-model.cpp b/src/llama-model.cpp index bc7a83b15f5..8910873f046 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -2208,6 +2208,7 @@ llama_model_params llama_model_default_params() { /*.use_extra_bufts =*/ true, /*.no_host =*/ false, /*.no_alloc =*/ false, + /*.reclaim_mmap_source =*/ false, }; return result; diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 43e05c3d56f..0be8cd2283c 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -879,7 +879,7 @@ static void 
llama_model_quantize_impl(const std::string & fname_inp, const std:: const llama_model_kv_override * kv_overrides = params->kv_overrides; std::vector splits = {}; llama_model_loader ml(/*metadata*/ nullptr, /*set_tensor_data*/ nullptr, /*set_tensor_data_ud*/ nullptr, - fname_inp, splits, /*file*/ nullptr, use_mmap, /*use_direct_io*/ false, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr); + fname_inp, splits, /*file*/ nullptr, use_mmap, /*use_direct_io*/ false, /*check_tensors*/ true, /*no_alloc*/ false, /*reclaim_mmap_source*/ false, kv_overrides, nullptr); ml.init_mappings(false); // no prefetching auto mparams = llama_model_default_params(); diff --git a/src/llama.cpp b/src/llama.cpp index a67fa8039a4..6ef3a1f4735 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -280,7 +280,7 @@ static std::pair llama_model_load(struct gguf_context * meta const std::string & fname, std::vector & splits, FILE * file, llama_model_params & params) { try { llama_model_loader ml(metadata, set_tensor_data, set_tensor_data_ud, fname, splits, file, params.use_mmap, params.use_direct_io, - params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides); + params.check_tensors, params.no_alloc, params.reclaim_mmap_source, params.kv_overrides, params.tensor_buft_overrides); ml.print_info(); std::unique_ptr model_ptr(llama_model_create(ml, params));