ggml-org · markkobo · Jun 5, 2026
@@ -2268,6 +2268,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.use_mmap = value;
         }
     ).set_env("LLAMA_ARG_MMAP"));
+    add_opt(common_arg(
+        {"--reclaim-mmap-source"},
+        "drop mmap'd source pages from RSS after they are copied out of the mmap into a separate buffer "
+        "(e.g. CPU weight repacking); saves memory for repack-eligible quants under mmap, no effect with --no-mmap or --mlock (Linux only)",
+        [](common_params & params) {
+            params.reclaim_mmap_source = true;
+        }
+    ).set_env("LLAMA_ARG_RECLAIM_MMAP_SOURCE"));
     add_opt(common_arg(
         {"-dio", "--direct-io"},
         {"-ndio", "--no-direct-io"},

@@ -1532,6 +1532,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
     mparams.check_tensors   = params.check_tensors;
     mparams.use_extra_bufts = !params.no_extra_bufts;
     mparams.no_host         = params.no_host;
+    mparams.reclaim_mmap_source = params.reclaim_mmap_source;
 
     if (params.kv_overrides.empty()) {
         mparams.kv_overrides = NULL;

@@ -551,6 +551,7 @@ struct common_params {
     bool use_mmap          = true;  // enable mmap to use filesystem cache
     bool use_direct_io     = false; // read from disk without buffering
     bool use_mlock         = false; // use mlock to keep model in memory
+    bool reclaim_mmap_source = false; // drop mmap'd source pages after they are copied out of the mmap (Linux only)
     bool verbose_prompt    = false; // print prompt tokens before generation
     bool display_prompt    = true;  // print prompt before generation
     bool no_kv_offload     = false; // disable KV offloading

diff --git a/include/llama.h b/include/llama.h
@@ -324,6 +324,7 @@ extern "C" {
         bool use_extra_bufts; // use extra buffer types (used for weight repacking)
         bool no_host;         // bypass host buffer allowing extra buffers to be used
         bool no_alloc;        // only load metadata and simulate memory allocations
+        bool reclaim_mmap_source; // drop mmap'd source pages after they are copied out of the mmap into a separate buffer (Linux only)
     };
 
     struct llama_sampler_seq_config {

@@ -5,6 +5,11 @@
 #include "gguf.h"
 #include "llama-hparams.h"
 
+#ifdef __linux__
+#include <sys/mman.h> // madvise, MADV_DONTNEED
+#include <unistd.h>   // sysconf, _SC_PAGESIZE
+#endif
+
 #include <algorithm>
 #include <array>
 #include <cinttypes>
@@ -518,6 +523,7 @@ llama_model_loader::llama_model_loader(
         bool use_direct_io,
         bool check_tensors,
         bool no_alloc,
+        bool reclaim_mmap_source,
         const llama_model_kv_override * param_overrides_p,
         const llama_model_tensor_buft_override * param_tensor_buft_overrides_p)
         : metadata(meta), set_tensor_data(set_tensor_data), set_tensor_data_ud(set_tensor_data_ud) {
@@ -816,6 +822,7 @@ llama_model_loader::llama_model_loader(
     this->use_direct_io = use_direct_io;
     this->check_tensors = check_tensors;
     this->no_alloc = no_alloc;
+    this->reclaim_mmap_source = reclaim_mmap_source;
 }
 
 std::string llama_model_loader::get_arch_name() const {
@@ -1518,6 +1525,11 @@ bool llama_model_loader::load_all_data(
             ggml_backend_name(upload_backend));
     }
 
+#ifdef __linux__
+    // Page size for reclaim_mmap_source; queried once and reused for every tensor copied out of the mmap.
+    const uintptr_t reclaim_page_size = (uintptr_t) sysconf(_SC_PAGESIZE);
+#endif
+
     for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
         const auto * weight = get_weight(ggml_get_name(cur));
         if (weight == nullptr) {
@@ -1560,6 +1572,26 @@ bool llama_model_loader::load_all_data(
                 mmap_used.second = std::max(mmap_used.second, weight->offs + n_size);
             } else {
                 ggml_backend_tensor_set(cur, data, 0, n_size);
+
+#ifdef __linux__
+                // This tensor's data was copied from the mmap into its own buffer (e.g.
+                // CPU_REPACK), so the source pages [data, data + n_size) are now dormant -
+                // compute reads only from the copy. madvise drops them from RSS; the
+                // read-only file mapping stays valid and any later access (e.g. a
+                // concurrent --check-tensors validation thread) simply re-faults the
+                // original bytes from disk. Skipped under mlock (lmlocks != nullptr),
+                // where the user wants the file pages resident. The range is rounded
+                // inward to whole pages so no partially covered page is evicted. Guarded
+                // by __linux__ to match the <sys/mman.h>/<unistd.h> includes this relies
+                // on. Best effort: the madvise result is ignored. See upstream #16761.
+                if (reclaim_mmap_source && lmlocks == nullptr) {
+                    const uintptr_t beg = ((uintptr_t) data + reclaim_page_size - 1) & ~(reclaim_page_size - 1);
+                    const uintptr_t end = ((uintptr_t) data + n_size)                & ~(reclaim_page_size - 1);
+                    if (end > beg) {
+                        madvise((void *) beg, end - beg, MADV_DONTNEED);
+                    }
+                }
+#endif
             }
         } else {
             const auto & file = files.at(weight->idx);

@@ -79,6 +79,7 @@ struct llama_model_loader {
     bool use_direct_io = false;
     bool check_tensors;
     bool no_alloc;
+    bool reclaim_mmap_source = false;
 
     llama_files files;
     llama_ftype ftype;
@@ -130,6 +131,7 @@ struct llama_model_loader {
         bool use_direct_io,
         bool check_tensors,
         bool no_alloc,
+        bool reclaim_mmap_source,
         const llama_model_kv_override * param_overrides_p,
         const llama_model_tensor_buft_override * param_tensor_buft_overrides_p);
 

@@ -2208,6 +2208,7 @@ llama_model_params llama_model_default_params() {
         /*.use_extra_bufts             =*/ true,
         /*.no_host                     =*/ false,
         /*.no_alloc                    =*/ false,
+        /*.reclaim_mmap_source         =*/ false,
     };
 
     return result;

@@ -879,7 +879,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     const llama_model_kv_override * kv_overrides = params->kv_overrides;
     std::vector<std::string> splits = {};
     llama_model_loader ml(/*metadata*/ nullptr, /*set_tensor_data*/ nullptr, /*set_tensor_data_ud*/ nullptr,
-        fname_inp, splits, /*file*/ nullptr, use_mmap, /*use_direct_io*/ false, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
+        fname_inp, splits, /*file*/ nullptr, use_mmap, /*use_direct_io*/ false, /*check_tensors*/ true, /*no_alloc*/ false, /*reclaim_mmap_source*/ false, kv_overrides, nullptr);
     ml.init_mappings(false); // no prefetching
 
     auto mparams = llama_model_default_params();

@@ -280,7 +280,7 @@ static std::pair<int, llama_model *> llama_model_load(struct gguf_context * meta
         const std::string & fname, std::vector<std::string> & splits, FILE * file, llama_model_params & params) {
     try {
         llama_model_loader ml(metadata, set_tensor_data, set_tensor_data_ud, fname, splits, file, params.use_mmap, params.use_direct_io,
-            params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides);
+            params.check_tensors, params.no_alloc, params.reclaim_mmap_source, params.kv_overrides, params.tensor_buft_overrides);
 
         ml.print_info();
         std::unique_ptr<llama_model> model_ptr(llama_model_create(ml, params));