Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions common/arg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2268,6 +2268,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.use_mmap = value;
}
).set_env("LLAMA_ARG_MMAP"));
// --reclaim-mmap-source: value-less opt-in switch. Its handler sets
// common_params::reclaim_mmap_source to true; the option is also associated with the
// LLAMA_ARG_RECLAIM_MMAP_SOURCE environment variable through set_env().
add_opt(common_arg(
    {"--reclaim-mmap-source"},
    "drop mmap'd source pages from RSS after they are copied out of the mmap "
    "into a separate buffer (e.g. CPU weight repacking); "
    "saves memory for repack-eligible quants under mmap, "
    "no effect with --no-mmap or --mlock (Linux only)",
    [](common_params & p) { p.reclaim_mmap_source = true; }
).set_env("LLAMA_ARG_RECLAIM_MMAP_SOURCE"));
add_opt(common_arg(
{"-dio", "--direct-io"},
{"-ndio", "--no-direct-io"},
Expand Down
1 change: 1 addition & 0 deletions common/common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1532,6 +1532,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
mparams.check_tensors = params.check_tensors;
mparams.use_extra_bufts = !params.no_extra_bufts;
mparams.no_host = params.no_host;
mparams.reclaim_mmap_source = params.reclaim_mmap_source;

if (params.kv_overrides.empty()) {
mparams.kv_overrides = NULL;
Expand Down
1 change: 1 addition & 0 deletions common/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -551,6 +551,7 @@ struct common_params {
bool use_mmap = true; // enable mmap to use filesystem cache
bool use_direct_io = false; // read from disk without buffering
bool use_mlock = false; // use mlock to keep model in memory
bool reclaim_mmap_source = false; // drop mmap'd source pages after they are copied out of the mmap (Linux only)
bool verbose_prompt = false; // print prompt tokens before generation
bool display_prompt = true; // print prompt before generation
bool no_kv_offload = false; // disable KV offloading
Expand Down
1 change: 1 addition & 0 deletions include/llama.h
Original file line number Diff line number Diff line change
Expand Up @@ -324,6 +324,7 @@ extern "C" {
bool use_extra_bufts; // use extra buffer types (used for weight repacking)
bool no_host; // bypass host buffer allowing extra buffers to be used
bool no_alloc; // only load metadata and simulate memory allocations
bool reclaim_mmap_source; // drop mmap'd source pages after they are copied out of the mmap into a separate buffer (Linux only)
};

struct llama_sampler_seq_config {
Expand Down
32 changes: 32 additions & 0 deletions src/llama-model-loader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,11 @@
#include "gguf.h"
#include "llama-hparams.h"

#ifdef __linux__
#include <sys/mman.h> // madvise, MADV_DONTNEED
#include <unistd.h> // sysconf, _SC_PAGESIZE
#endif

#include <algorithm>
#include <array>
#include <cinttypes>
Expand Down Expand Up @@ -518,6 +523,7 @@ llama_model_loader::llama_model_loader(
bool use_direct_io,
bool check_tensors,
bool no_alloc,
bool reclaim_mmap_source,
const llama_model_kv_override * param_overrides_p,
const llama_model_tensor_buft_override * param_tensor_buft_overrides_p)
: metadata(meta), set_tensor_data(set_tensor_data), set_tensor_data_ud(set_tensor_data_ud) {
Expand Down Expand Up @@ -816,6 +822,7 @@ llama_model_loader::llama_model_loader(
this->use_direct_io = use_direct_io;
this->check_tensors = check_tensors;
this->no_alloc = no_alloc;
this->reclaim_mmap_source = reclaim_mmap_source;
}

std::string llama_model_loader::get_arch_name() const {
Expand Down Expand Up @@ -1518,6 +1525,11 @@ bool llama_model_loader::load_all_data(
ggml_backend_name(upload_backend));
}

#if defined(MADV_DONTNEED)
// Page size used to page-align the ranges handed to madvise() for reclaim_mmap_source;
// queried once and reused for every tensor that is copied out of the mmap.
//
// sysconf() reports failure by returning -1. Casting that directly to uintptr_t would
// produce an all-ones "page size" and a nonsensical alignment mask, so the raw value is
// validated first (positive and a power of two). On failure the page size is set to 0:
// the mask ~(0 - 1) is then 0, both page-rounded bounds collapse to 0, the range is
// empty, and no madvise() call is issued - i.e. reclaiming is silently disabled.
const long reclaim_page_size_raw = sysconf(_SC_PAGESIZE);
const uintptr_t reclaim_page_size =
    (reclaim_page_size_raw > 0 && (reclaim_page_size_raw & (reclaim_page_size_raw - 1)) == 0)
        ? (uintptr_t) reclaim_page_size_raw
        : (uintptr_t) 0;
#endif

for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
const auto * weight = get_weight(ggml_get_name(cur));
if (weight == nullptr) {
Expand Down Expand Up @@ -1560,6 +1572,26 @@ bool llama_model_loader::load_all_data(
mmap_used.second = std::max(mmap_used.second, weight->offs + n_size);
} else {
ggml_backend_tensor_set(cur, data, 0, n_size);

#if defined(MADV_DONTNEED)
// Reached only after ggml_backend_tensor_set() has copied this tensor's bytes out of
// the mmap into its own buffer (e.g. CPU_REPACK), so the source range
// [data, data + n_size) is no longer needed for compute. MADV_DONTNEED asks the kernel
// to drop those pages from RSS. The mapping itself is left in place; per madvise(2), a
// later read of a file-backed mapping (e.g. by a concurrent --check-tensors validation
// thread) is expected to re-fault the original bytes from the file rather than see
// wrong data.
//
// Not done when mlock is in use (lmlocks != nullptr): there the user asked for the
// file pages to stay resident.
//
// The range is shrunk inward to whole pages - start rounded up, end rounded down - so
// a page shared with a neighbouring zero-copy tensor is never evicted. Addresses the
// memory duplication reported upstream in #16761.
if (lmlocks == nullptr && reclaim_mmap_source) {
    const uintptr_t page_mask = ~(reclaim_page_size - 1);
    const uintptr_t src_addr  = (uintptr_t) data;

    // first page boundary at or after the start of the source range
    const uintptr_t first_page = (src_addr + reclaim_page_size - 1) & page_mask;
    // last page boundary at or before the end of the source range
    const uintptr_t past_last_page = (src_addr + n_size) & page_mask;

    // tensors that do not cover at least one whole page are left untouched
    if (first_page < past_last_page) {
        madvise((void *) first_page, past_last_page - first_page, MADV_DONTNEED);
    }
}
#endif
}
} else {
const auto & file = files.at(weight->idx);
Expand Down
2 changes: 2 additions & 0 deletions src/llama-model-loader.h
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ struct llama_model_loader {
bool use_direct_io = false;
bool check_tensors;
bool no_alloc;
bool reclaim_mmap_source = false;

llama_files files;
llama_ftype ftype;
Expand Down Expand Up @@ -130,6 +131,7 @@ struct llama_model_loader {
bool use_direct_io,
bool check_tensors,
bool no_alloc,
bool reclaim_mmap_source,
const llama_model_kv_override * param_overrides_p,
const llama_model_tensor_buft_override * param_tensor_buft_overrides_p);

Expand Down
1 change: 1 addition & 0 deletions src/llama-model.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2208,6 +2208,7 @@ llama_model_params llama_model_default_params() {
/*.use_extra_bufts =*/ true,
/*.no_host =*/ false,
/*.no_alloc =*/ false,
/*.reclaim_mmap_source =*/ false,
};

return result;
Expand Down
2 changes: 1 addition & 1 deletion src/llama-quant.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -879,7 +879,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
const llama_model_kv_override * kv_overrides = params->kv_overrides;
std::vector<std::string> splits = {};
llama_model_loader ml(/*metadata*/ nullptr, /*set_tensor_data*/ nullptr, /*set_tensor_data_ud*/ nullptr,
fname_inp, splits, /*file*/ nullptr, use_mmap, /*use_direct_io*/ false, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
fname_inp, splits, /*file*/ nullptr, use_mmap, /*use_direct_io*/ false, /*check_tensors*/ true, /*no_alloc*/ false, /*reclaim_mmap_source*/ false, kv_overrides, nullptr);
ml.init_mappings(false); // no prefetching

auto mparams = llama_model_default_params();
Expand Down
2 changes: 1 addition & 1 deletion src/llama.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -280,7 +280,7 @@ static std::pair<int, llama_model *> llama_model_load(struct gguf_context * meta
const std::string & fname, std::vector<std::string> & splits, FILE * file, llama_model_params & params) {
try {
llama_model_loader ml(metadata, set_tensor_data, set_tensor_data_ud, fname, splits, file, params.use_mmap, params.use_direct_io,
params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides);
params.check_tensors, params.no_alloc, params.reclaim_mmap_source, params.kv_overrides, params.tensor_buft_overrides);

ml.print_info();
std::unique_ptr<llama_model> model_ptr(llama_model_create(ml, params));
Expand Down