From aba63239f0cc796507a4022b85c96c53c3054952 Mon Sep 17 00:00:00 2001
From: Mark Ko <markkobo@gmail.com>
Date: Fri, 5 Jun 2026 00:39:52 +0000
Subject: [PATCH] fix - model-loader : add --reclaim-mmap-source to drop
 dormant mmap pages
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

After a tensor is copied out of the mmap into a separate buffer (e.g. CPU
weight repacking), the read-only file-backed source pages are dormant but
still sit in the RSS.

With --reclaim-mmap-source, madvise(MADV_DONTNEED) drops them; the mapping
stays valid and re-faults from the file if ever touched. Linux only,
flag-gated, skipped under --mlock. The range is rounded inward to whole pages.
Addresses #16761.

The flag defaults to OFF. However, since this is a bug fix for issue #16761
(https://github.com/ggml-org/llama.cpp/issues/16761), I'm happy to flip the
default to ON and/or gate it behind an environment variable instead, so that
it can be removed cleanly later.

Output is byte-identical with the flag off and on:

```bash
./build/bin/llama-completion "${ARGS[@]}"                       >off.txt 2>/dev/null
./build/bin/llama-completion "${ARGS[@]}" --reclaim-mmap-source >on.txt  2>/dev/null
cmp off.txt on.txt && echo "PASS: byte-identical output"
```

Perplexity on wikitext-2 with and without the flag:

```bash
bash scripts/get-wikitext-2.sh   # ~280 KB
WIKI=wikitext-2-raw/wiki.test.raw
./build/bin/llama-perplexity -m "$MODEL" -f "$WIKI" --chunks 64 -t "$T"
./build/bin/llama-perplexity -m "$MODEL" -f "$WIKI" --chunks 64 -t "$T" --reclaim-mmap-source
```

Peak RSS and major page faults with and without the flag:

```bash
/usr/bin/time -v ./build/bin/llama-completion "${ARGS[@]}"                       2>&1 >/dev/null | grep -E "Maximum resident|Major .* page faults"
/usr/bin/time -v ./build/bin/llama-completion "${ARGS[@]}" --reclaim-mmap-source 2>&1 >/dev/null | grep -E "Maximum resident|Major .* page faults"
```

Observed peak RSS (`-c 4096`), `Major page faults = 0` in every run:

| model | quant / arch | OFF | ON | saving | CPU_REPACK buffer |
|---|---|---:|---:|---:|---:|
| Llama-3.2-3B | Q4_0 / dense | 3.73 GiB | 2.30 GiB | −1.43 GiB (38%) | 1.47 GiB |
| DeepSeek-V2-Lite | Q4_K_M / MoE | 16.19 GiB | 10.75 GiB | −5.44 GiB (34%) | 5.58 GiB |
| Qwen3-30B-A3B | Q4_K_M / MoE | ~34 GiB | ~21 GiB | −13.0 GiB (37%) | 13.4 GiB |

The saving tracks the CPU_REPACK buffer size (the duplicated source). `majflt=0`
confirms the dropped pages are never re-faulted on the hot path.

The flag is inert under --no-mmap and --mlock:

```bash
./build/bin/llama-completion "${ARGS[@]}" --no-mmap --reclaim-mmap-source >nommap.txt 2>/dev/null
./build/bin/llama-completion "${ARGS[@]}" --mlock   --reclaim-mmap-source >mlock.txt  2>/dev/null
cmp off.txt nommap.txt && cmp off.txt mlock.txt && echo "PASS: inert under --no-mmap and --mlock"
```

- Behavior is unchanged by construction — the flag only `madvise(MADV_DONTNEED)`s
  the read-only file-backed source after it has been copied out; any access
  re-faults identical bytes from the file.
- Linux-only (`#if defined(MADV_DONTNEED)`); compiles to a no-op elsewhere.
- The detailed before/after VMA breakdown from `/proc/<pid>/smaps` shows
  `model_mmap` shrinking from 17.6 to 4.3 GiB on Qwen3.
---
 common/arg.cpp             |  8 ++++++++
 common/common.cpp          |  1 +
 common/common.h            |  1 +
 include/llama.h            |  1 +
 src/llama-model-loader.cpp | 32 ++++++++++++++++++++++++++++++++
 src/llama-model-loader.h   |  2 ++
 src/llama-model.cpp        |  1 +
 src/llama-quant.cpp        |  2 +-
 src/llama.cpp              |  2 +-
 9 files changed, 48 insertions(+), 2 deletions(-)
diff --git a/common/arg.cpp b/common/arg.cpp
index 1ffaf704858..b7adba447d9 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -2268,6 +2268,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.use_mmap = value;
         }
     ).set_env("LLAMA_ARG_MMAP"));
+    add_opt(common_arg(
+        {"--reclaim-mmap-source"},
+        "drop mmap'd source pages from RSS after they are copied out of the mmap into a separate buffer "
+        "(e.g. CPU weight repacking); saves memory for repack-eligible quants under mmap, no effect with --no-mmap or --mlock (Linux only)",
+        [](common_params & params) {
+            params.reclaim_mmap_source = true;
+        }
+    ).set_env("LLAMA_ARG_RECLAIM_MMAP_SOURCE"));
     add_opt(common_arg(
         {"-dio", "--direct-io"},
         {"-ndio", "--no-direct-io"},
diff --git a/common/common.cpp b/common/common.cpp
index b6a7626f2a1..35cd4269278 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1532,6 +1532,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
     mparams.check_tensors   = params.check_tensors;
     mparams.use_extra_bufts = !params.no_extra_bufts;
     mparams.no_host         = params.no_host;
+    mparams.reclaim_mmap_source = params.reclaim_mmap_source;
 
     if (params.kv_overrides.empty()) {
         mparams.kv_overrides = NULL;
diff --git a/common/common.h b/common/common.h
index 13f387271d8..3f68481e0f7 100644
--- a/common/common.h
+++ b/common/common.h
@@ -551,6 +551,7 @@ struct common_params {
     bool use_mmap          = true;  // enable mmap to use filesystem cache
     bool use_direct_io     = false; // read from disk without buffering
     bool use_mlock         = false; // use mlock to keep model in memory
+    bool reclaim_mmap_source = false; // drop mmap'd source pages after they are copied out of the mmap (Linux only)
     bool verbose_prompt    = false; // print prompt tokens before generation
     bool display_prompt    = true;  // print prompt before generation
     bool no_kv_offload     = false; // disable KV offloading
diff --git a/include/llama.h b/include/llama.h
index 9f78aa9a056..f0c1a0d9711 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -324,6 +324,7 @@ extern "C" {
         bool use_extra_bufts; // use extra buffer types (used for weight repacking)
         bool no_host;         // bypass host buffer allowing extra buffers to be used
         bool no_alloc;        // only load metadata and simulate memory allocations
+        bool reclaim_mmap_source; // drop mmap'd source pages after they are copied out of the mmap into a separate buffer (Linux only)
     };
 
     struct llama_sampler_seq_config {
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
index 4d7b11067c9..b30b26b6b86 100644
--- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp
@@ -5,6 +5,11 @@
 #include "gguf.h"
 #include "llama-hparams.h"
 
+#ifdef __linux__
+#include <sys/mman.h> // madvise, MADV_DONTNEED
+#include <unistd.h>   // sysconf, _SC_PAGESIZE
+#endif
+
 #include <algorithm>
 #include <array>
 #include <cinttypes>
@@ -518,6 +523,7 @@ llama_model_loader::llama_model_loader(
         bool use_direct_io,
         bool check_tensors,
         bool no_alloc,
+        bool reclaim_mmap_source,
         const llama_model_kv_override * param_overrides_p,
         const llama_model_tensor_buft_override * param_tensor_buft_overrides_p)
         : metadata(meta), set_tensor_data(set_tensor_data), set_tensor_data_ud(set_tensor_data_ud) {
@@ -816,6 +822,7 @@ llama_model_loader::llama_model_loader(
     this->use_direct_io = use_direct_io;
     this->check_tensors = check_tensors;
     this->no_alloc = no_alloc;
+    this->reclaim_mmap_source = reclaim_mmap_source;
 }
 
 std::string llama_model_loader::get_arch_name() const {
@@ -1518,6 +1525,11 @@ bool llama_model_loader::load_all_data(
             ggml_backend_name(upload_backend));
     }
 
+#if defined(MADV_DONTNEED)
+    // Page size for reclaim_mmap_source; queried once and reused for every repacked tensor.
+    const uintptr_t reclaim_page_size = (uintptr_t) sysconf(_SC_PAGESIZE);
+#endif
+
     for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
         const auto * weight = get_weight(ggml_get_name(cur));
         if (weight == nullptr) {
@@ -1560,6 +1572,26 @@ bool llama_model_loader::load_all_data(
                 mmap_used.second = std::max(mmap_used.second, weight->offs + n_size);
             } else {
                 ggml_backend_tensor_set(cur, data, 0, n_size);
+
+#if defined(MADV_DONTNEED)
+                // This tensor's data was copied from the mmap into its own buffer (e.g.
+                // CPU_REPACK), so the source pages [data, data + n_size) are now dormant -
+                // compute reads only from the copy. madvise drops them from RSS; the
+                // read-only file mapping stays valid and any later access (e.g. a
+                // concurrent --check-tensors validation thread) simply re-faults the
+                // original bytes from disk - safe, never wrong data. Skipped under mlock
+                // (lmlocks != nullptr), where the user wants the file pages resident. The
+                // range is rounded inward to whole pages so we never evict a page shared
+                // with a neighbouring zero-copy tensor. Reclaims the duplication reported
+                // upstream in #16761.
+                if (reclaim_mmap_source && lmlocks == nullptr) {
+                    const uintptr_t beg = ((uintptr_t) data + reclaim_page_size - 1) & ~(reclaim_page_size - 1);
+                    const uintptr_t end = ((uintptr_t) data + n_size)                & ~(reclaim_page_size - 1);
+                    if (end > beg) {
+                        madvise((void *) beg, end - beg, MADV_DONTNEED);
+                    }
+                }
+#endif
             }
         } else {
             const auto & file = files.at(weight->idx);
diff --git a/src/llama-model-loader.h b/src/llama-model-loader.h
index c476026d3e5..72147d70f9d 100644
--- a/src/llama-model-loader.h
+++ b/src/llama-model-loader.h
@@ -79,6 +79,7 @@ struct llama_model_loader {
     bool use_direct_io = false;
     bool check_tensors;
     bool no_alloc;
+    bool reclaim_mmap_source = false;
 
     llama_files files;
     llama_ftype ftype;
@@ -130,6 +131,7 @@ struct llama_model_loader {
         bool use_direct_io,
         bool check_tensors,
         bool no_alloc,
+        bool reclaim_mmap_source,
         const llama_model_kv_override * param_overrides_p,
         const llama_model_tensor_buft_override * param_tensor_buft_overrides_p);
 
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index bc7a83b15f5..8910873f046 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -2208,6 +2208,7 @@ llama_model_params llama_model_default_params() {
         /*.use_extra_bufts             =*/ true,
         /*.no_host                     =*/ false,
         /*.no_alloc                    =*/ false,
+        /*.reclaim_mmap_source       =*/ false,
     };
 
     return result;
diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 43e05c3d56f..0be8cd2283c 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -879,7 +879,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     const llama_model_kv_override * kv_overrides = params->kv_overrides;
     std::vector<std::string> splits = {};
     llama_model_loader ml(/*metadata*/ nullptr, /*set_tensor_data*/ nullptr, /*set_tensor_data_ud*/ nullptr,
-        fname_inp, splits, /*file*/ nullptr, use_mmap, /*use_direct_io*/ false, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
+        fname_inp, splits, /*file*/ nullptr, use_mmap, /*use_direct_io*/ false, /*check_tensors*/ true, /*no_alloc*/ false, /*reclaim_mmap_source*/ false, kv_overrides, nullptr);
     ml.init_mappings(false); // no prefetching
 
     auto mparams = llama_model_default_params();
diff --git a/src/llama.cpp b/src/llama.cpp
index a67fa8039a4..6ef3a1f4735 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -280,7 +280,7 @@ static std::pair<int, llama_model *> llama_model_load(struct gguf_context * meta
         const std::string & fname, std::vector<std::string> & splits, FILE * file, llama_model_params & params) {
     try {
         llama_model_loader ml(metadata, set_tensor_data, set_tensor_data_ud, fname, splits, file, params.use_mmap, params.use_direct_io,
-            params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides);
+            params.check_tensors, params.no_alloc, params.reclaim_mmap_source, params.kv_overrides, params.tensor_buft_overrides);
 
         ml.print_info();
         std::unique_ptr<llama_model> model_ptr(llama_model_create(ml, params));