diff --git a/common/arg.cpp b/common/arg.cpp
index 1ffaf704858..b7adba447d9 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -2268,6 +2268,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.use_mmap = value;
         }
     ).set_env("LLAMA_ARG_MMAP"));
+    add_opt(common_arg(
+        {"--reclaim-mmap-source"},
+        "drop mmap'd source pages from RSS after they are copied out of the mmap into a separate buffer "
+        "(e.g. CPU weight repacking); saves memory for repack-eligible quants under mmap, no effect with --no-mmap or --mlock (Linux only)",
+        [](common_params & params) {
+            params.reclaim_mmap_source = true;
+        }
+    ).set_env("LLAMA_ARG_RECLAIM_MMAP_SOURCE"));
     add_opt(common_arg(
         {"-dio", "--direct-io"},
         {"-ndio", "--no-direct-io"},
diff --git a/common/common.cpp b/common/common.cpp
index b6a7626f2a1..35cd4269278 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1532,6 +1532,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
     mparams.check_tensors = params.check_tensors;
     mparams.use_extra_bufts = !params.no_extra_bufts;
     mparams.no_host = params.no_host;
+    mparams.reclaim_mmap_source = params.reclaim_mmap_source;
 
     if (params.kv_overrides.empty()) {
         mparams.kv_overrides = NULL;
diff --git a/common/common.h b/common/common.h
index 13f387271d8..3f68481e0f7 100644
--- a/common/common.h
+++ b/common/common.h
@@ -551,6 +551,7 @@ struct common_params {
     bool use_mmap = true; // enable mmap to use filesystem cache
     bool use_direct_io = false; // read from disk without buffering
     bool use_mlock = false; // use mlock to keep model in memory
+    bool reclaim_mmap_source = false; // drop mmap'd source pages after they are copied out of the mmap (Linux only)
     bool verbose_prompt = false; // print prompt tokens before generation
     bool display_prompt = true; // print prompt before generation
     bool no_kv_offload = false; // disable KV offloading
diff --git a/include/llama.h b/include/llama.h
index 9f78aa9a056..f0c1a0d9711 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -324,6 +324,7 @@ extern "C" {
         bool use_extra_bufts; // use extra buffer types (used for weight repacking)
         bool no_host; // bypass host buffer allowing extra buffers to be used
         bool no_alloc; // only load metadata and simulate memory allocations
+        bool reclaim_mmap_source; // drop mmap'd source pages after they are copied out of the mmap into a separate buffer (Linux only)
     };
 
     struct llama_sampler_seq_config {
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
index 4d7b11067c9..b30b26b6b86 100644
--- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp
@@ -5,6 +5,11 @@
 #include "gguf.h"
 #include "llama-hparams.h"
 
+#ifdef __linux__
+#include <sys/mman.h> // madvise, MADV_DONTNEED
+#include <unistd.h>   // sysconf, _SC_PAGESIZE
+#endif
+
 #include
 #include
 #include
@@ -518,6 +523,7 @@ llama_model_loader::llama_model_loader(
         bool use_direct_io,
         bool check_tensors,
         bool no_alloc,
+        bool reclaim_mmap_source,
         const llama_model_kv_override * param_overrides_p,
         const llama_model_tensor_buft_override * param_tensor_buft_overrides_p)
         : metadata(meta), set_tensor_data(set_tensor_data), set_tensor_data_ud(set_tensor_data_ud) {
@@ -816,6 +822,7 @@ llama_model_loader::llama_model_loader(
     this->use_direct_io = use_direct_io;
     this->check_tensors = check_tensors;
     this->no_alloc = no_alloc;
+    this->reclaim_mmap_source = reclaim_mmap_source;
 }
 
 std::string llama_model_loader::get_arch_name() const {
@@ -1518,6 +1525,11 @@ bool llama_model_loader::load_all_data(
                 ggml_backend_name(upload_backend));
     }
 
+#ifdef __linux__
+    // Page size for reclaim_mmap_source, queried once; 0 means sysconf failed and disables reclaim.
+    const uintptr_t reclaim_page_size = (uintptr_t) std::max<long>(sysconf(_SC_PAGESIZE), 0);
+#endif
+
     for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
         const auto * weight = get_weight(ggml_get_name(cur));
         if (weight == nullptr) {
@@ -1560,6 +1572,26 @@ bool llama_model_loader::load_all_data(
                 mmap_used.second = std::max(mmap_used.second, weight->offs + n_size);
             } else {
                 ggml_backend_tensor_set(cur, data, 0, n_size);
+
+#ifdef __linux__
+                // This tensor's data was copied from the mmap into its own buffer (e.g.
+                // CPU_REPACK), so the source pages [data, data + n_size) are now dormant -
+                // compute reads only from the copy. madvise drops them from RSS; the
+                // read-only file mapping stays valid and any later access (e.g. a
+                // concurrent --check-tensors validation thread) simply re-faults the
+                // original bytes from disk. Best effort: the madvise result is ignored.
+                // Skipped under mlock (lmlocks != nullptr), where the user wants the file
+                // pages resident, and when the page size is unknown (0). The range is
+                // rounded inward to whole pages so we never evict a page shared with a
+                // neighbouring zero-copy tensor. Reclaims the duplication from #16761.
+                if (reclaim_mmap_source && lmlocks == nullptr && reclaim_page_size > 0) {
+                    const uintptr_t beg = ((uintptr_t) data + reclaim_page_size - 1) & ~(reclaim_page_size - 1);
+                    const uintptr_t end = ((uintptr_t) data + n_size) & ~(reclaim_page_size - 1);
+                    if (end > beg) {
+                        madvise((void *) beg, end - beg, MADV_DONTNEED);
+                    }
+                }
+#endif
             }
         } else {
             const auto & file = files.at(weight->idx);
diff --git a/src/llama-model-loader.h b/src/llama-model-loader.h
index c476026d3e5..72147d70f9d 100644
--- a/src/llama-model-loader.h
+++ b/src/llama-model-loader.h
@@ -79,6 +79,7 @@ struct llama_model_loader {
     bool use_direct_io = false;
     bool check_tensors;
     bool no_alloc;
+    bool reclaim_mmap_source = false;
 
     llama_files files;
     llama_ftype ftype;
@@ -130,6 +131,7 @@ struct llama_model_loader {
         bool use_direct_io,
         bool check_tensors,
         bool no_alloc,
+        bool reclaim_mmap_source,
         const llama_model_kv_override * param_overrides_p,
         const llama_model_tensor_buft_override * param_tensor_buft_overrides_p);
 
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index bc7a83b15f5..8910873f046 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -2208,6 +2208,7 @@ llama_model_params llama_model_default_params() {
         /*.use_extra_bufts =*/ true,
         /*.no_host =*/ false,
         /*.no_alloc =*/ false,
+        /*.reclaim_mmap_source =*/ false,
     };
 
     return result;
diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 43e05c3d56f..0be8cd2283c 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -879,7 +879,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     const llama_model_kv_override * kv_overrides = params->kv_overrides;
     std::vector splits = {};
     llama_model_loader ml(/*metadata*/ nullptr, /*set_tensor_data*/ nullptr, /*set_tensor_data_ud*/ nullptr,
-        fname_inp, splits, /*file*/ nullptr, use_mmap, /*use_direct_io*/ false, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
+        fname_inp, splits, /*file*/ nullptr, use_mmap, /*use_direct_io*/ false, /*check_tensors*/ true, /*no_alloc*/ false, /*reclaim_mmap_source*/ false, kv_overrides, nullptr);
     ml.init_mappings(false); // no prefetching
 
     auto mparams = llama_model_default_params();
diff --git a/src/llama.cpp b/src/llama.cpp
index a67fa8039a4..6ef3a1f4735 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -280,7 +280,7 @@ static std::pair llama_model_load(struct gguf_context * meta
         const std::string & fname, std::vector & splits, FILE * file, llama_model_params & params) {
     try {
         llama_model_loader ml(metadata, set_tensor_data, set_tensor_data_ud, fname, splits, file, params.use_mmap, params.use_direct_io,
-            params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides);
+            params.check_tensors, params.no_alloc, params.reclaim_mmap_source, params.kv_overrides, params.tensor_buft_overrides);
 
         ml.print_info();
         std::unique_ptr model_ptr(llama_model_create(ml, params));