From 9b9d109f6f73908a14a5d996694bcf065eafcb76 Mon Sep 17 00:00:00 2001 From: Wagner Bruna Date: Sat, 16 May 2026 15:52:31 -0300 Subject: [PATCH] feat: make negative max_vram control the amount of spare vram -1 still means 1 GiB, but e.g. -0.2 would spare only 0.2 GiB. --- examples/cli/README.md | 3 ++- examples/common/common.cpp | 2 +- examples/server/README.md | 3 ++- src/ggml_graph_cut.cpp | 25 +++++++++++++------------ 4 files changed, 18 insertions(+), 15 deletions(-) diff --git a/examples/cli/README.md b/examples/cli/README.md index 77cf46320..d149d210f 100644 --- a/examples/cli/README.md +++ b/examples/cli/README.md @@ -55,7 +55,8 @@ Context Options: then threads will be set to the number of CPU physical cores --chroma-t5-mask-pad t5 mask pad size of chroma --max-vram maximum VRAM budget in GiB for graph-cut segmented execution. 0 disables - graph splitting; -1 auto-detects free VRAM minus 1 GiB + graph splitting; a negative value auto-detects free VRAM, sparing the + specified value (e.g. -0.5 will keep at least 0.5 GiB free) --force-sdxl-vae-conv-scale force use of conv scale on sdxl vae --offload-to-cpu place the weights in RAM to save VRAM, and automatically load them into VRAM when needed diff --git a/examples/common/common.cpp b/examples/common/common.cpp index dd4581eb0..85c03b412 100644 --- a/examples/common/common.cpp +++ b/examples/common/common.cpp @@ -413,7 +413,7 @@ ArgOptions SDContextParams::get_options() { options.float_options = { {"", "--max-vram", - "maximum VRAM budget in GiB for graph-cut segmented execution. 0 disables graph splitting; -1 auto-detects free VRAM minus 1 GiB", + "maximum VRAM budget in GiB for graph-cut segmented execution. 0 disables graph splitting; a negative value auto-detects free VRAM, sparing the specified value (e.g. 
-0.5 will keep at least 0.5 GiB free)", &max_vram}, }; diff --git a/examples/server/README.md b/examples/server/README.md index cdaac8c64..194cab04e 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -157,7 +157,8 @@ Context Options: then threads will be set to the number of CPU physical cores --chroma-t5-mask-pad t5 mask pad size of chroma --max-vram maximum VRAM budget in GiB for graph-cut segmented execution. 0 disables - graph splitting; -1 auto-detects free VRAM minus 1 GiB + graph splitting; a negative value auto-detects free VRAM, sparing the + specified value (e.g. -0.5 will keep at least 0.5 GiB free) --force-sdxl-vae-conv-scale force use of conv scale on sdxl vae --offload-to-cpu place the weights in RAM to save VRAM, and automatically load them into VRAM when needed diff --git a/src/ggml_graph_cut.cpp b/src/ggml_graph_cut.cpp index d07e08be7..0100a3d25 100644 --- a/src/ggml_graph_cut.cpp +++ b/src/ggml_graph_cut.cpp @@ -17,7 +17,6 @@ namespace sd::ggml_graph_cut { static constexpr double MAX_VRAM_BYTES_PER_GIB = 1024.0 * 1024.0 * 1024.0; - static constexpr size_t MAX_VRAM_AUTO_RESERVE_BYTES = 1024ULL * 1024ULL * 1024ULL; static std::string graph_cut_tensor_display_name(const ggml_tensor* tensor) { if (tensor == nullptr) { @@ -93,45 +92,47 @@ namespace sd::ggml_graph_cut { return static_cast(static_cast(max_vram_bytes) / MAX_VRAM_BYTES_PER_GIB); } - static size_t resolve_auto_max_vram_bytes(ggml_backend_t backend) { + static size_t resolve_auto_max_vram_bytes(float spare_vram, ggml_backend_t backend) { if (backend == nullptr) { - LOG_WARN("--max-vram -1 requested, but no backend is available; disabling graph splitting"); + LOG_WARN("--max-vram < 0 requested, but no backend is available; disabling graph splitting"); return 0; } ggml_backend_dev_t dev = ggml_backend_get_device(backend); if (dev == nullptr) { - LOG_WARN("--max-vram -1 requested, but no backend device is available; disabling graph splitting"); + LOG_WARN("--max-vram < 0 
requested, but no backend device is available; disabling graph splitting"); return 0; } if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) { - LOG_WARN("--max-vram -1 requested, but the main backend is CPU; disabling graph splitting"); + LOG_WARN("--max-vram < 0 requested, but the main backend is CPU; disabling graph splitting"); return 0; } size_t free_vram = 0; size_t total_vram = 0; ggml_backend_dev_memory(dev, &free_vram, &total_vram); + const double spare_bytes = MAX_VRAM_BYTES_PER_GIB * spare_vram; /* kept as double: converting a NaN or out-of-range value to size_t is undefined behaviour */ - if (free_vram <= MAX_VRAM_AUTO_RESERVE_BYTES) { - LOG_WARN("--max-vram -1 requested, but free VRAM is %.2f GiB; reserving 1.00 GiB leaves no graph budget", - free_vram / MAX_VRAM_BYTES_PER_GIB); + if (!(spare_bytes >= 0.0 && spare_bytes < static_cast<double>(free_vram))) { /* negated range test so NaN/inf reserves also land here */ + LOG_WARN("--max-vram < 0 requested, but free VRAM is %.2f GiB; reserving %.2f GiB leaves no graph budget", + free_vram / MAX_VRAM_BYTES_PER_GIB, spare_vram); return 0; } - const size_t max_vram_bytes = free_vram - MAX_VRAM_AUTO_RESERVE_BYTES; - LOG_INFO("--max-vram -1 auto-detected %.2f GiB free VRAM (%.2f GiB total), reserving 1.00 GiB; using %.2f GiB", + const size_t max_vram_bytes = free_vram - static_cast<size_t>(spare_bytes); /* safe: 0 <= spare_bytes < free_vram was checked above */ + LOG_INFO("--max-vram < 0 auto-detected %.2f GiB free VRAM (%.2f GiB total), reserving %.2f GiB; using %.2f GiB", free_vram / MAX_VRAM_BYTES_PER_GIB, total_vram / MAX_VRAM_BYTES_PER_GIB, + spare_vram, max_vram_bytes / MAX_VRAM_BYTES_PER_GIB); return max_vram_bytes; } float resolve_max_vram_gib(float max_vram, ggml_backend_t backend) { - if (max_vram != -1.f) { + if (max_vram >= 0.f) { return max_vram; } - return max_vram_bytes_to_gib(resolve_auto_max_vram_bytes(backend)); + return max_vram_bytes_to_gib(resolve_auto_max_vram_bytes(-max_vram, backend)); } static Segment make_segment_seed(const Plan& plan,