From 9b9d109f6f73908a14a5d996694bcf065eafcb76 Mon Sep 17 00:00:00 2001
From: Wagner Bruna <wbruna@yahoo.com>
Date: Sat, 16 May 2026 15:52:31 -0300
Subject: [PATCH] feat: make negative max_vram control the amount of spare vram

-1 still spares 1 GiB of the detected free VRAM, but e.g. -0.2 would spare only 0.2 GiB.
---
 examples/cli/README.md     |  3 ++-
 examples/common/common.cpp |  2 +-
 examples/server/README.md  |  3 ++-
 src/ggml_graph_cut.cpp     | 25 +++++++++++++------------
 4 files changed, 18 insertions(+), 15 deletions(-)
diff --git a/examples/cli/README.md b/examples/cli/README.md
index 77cf46320..d149d210f 100644
--- a/examples/cli/README.md
+++ b/examples/cli/README.md
@@ -55,7 +55,8 @@ Context Options:
                                            then threads will be set to the number of CPU physical cores
   --chroma-t5-mask-pad <int>               t5 mask pad size of chroma
   --max-vram <float>                       maximum VRAM budget in GiB for graph-cut segmented execution. 0 disables
-                                           graph splitting; -1 auto-detects free VRAM minus 1 GiB
+                                           graph splitting; a negative value auto-detects free VRAM, sparing the
+                                           specified value (e.g. -0.5 will keep at least 0.5 GiB free)
   --force-sdxl-vae-conv-scale              force use of conv scale on sdxl vae
   --offload-to-cpu                         place the weights in RAM to save VRAM, and automatically load them into VRAM
                                            when needed
diff --git a/examples/common/common.cpp b/examples/common/common.cpp
index dd4581eb0..85c03b412 100644
--- a/examples/common/common.cpp
+++ b/examples/common/common.cpp
@@ -413,7 +413,7 @@ ArgOptions SDContextParams::get_options() {
     options.float_options = {
         {"",
          "--max-vram",
-         "maximum VRAM budget in GiB for graph-cut segmented execution. 0 disables graph splitting; -1 auto-detects free VRAM minus 1 GiB",
+         "maximum VRAM budget in GiB for graph-cut segmented execution. 0 disables graph splitting; a negative value auto-detects free VRAM, sparing the specified value (e.g. -0.5 will keep at least 0.5 GiB free)",
          &max_vram},
     };
 
diff --git a/examples/server/README.md b/examples/server/README.md
index cdaac8c64..194cab04e 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -157,7 +157,8 @@ Context Options:
                                            then threads will be set to the number of CPU physical cores
   --chroma-t5-mask-pad <int>               t5 mask pad size of chroma
   --max-vram <float>                       maximum VRAM budget in GiB for graph-cut segmented execution. 0 disables
-                                           graph splitting; -1 auto-detects free VRAM minus 1 GiB
+                                           graph splitting; a negative value auto-detects free VRAM, sparing the
+                                           specified value (e.g. -0.5 will keep at least 0.5 GiB free)
   --force-sdxl-vae-conv-scale              force use of conv scale on sdxl vae
   --offload-to-cpu                         place the weights in RAM to save VRAM, and automatically load them into VRAM
                                            when needed
diff --git a/src/ggml_graph_cut.cpp b/src/ggml_graph_cut.cpp
index d07e08be7..0100a3d25 100644
--- a/src/ggml_graph_cut.cpp
+++ b/src/ggml_graph_cut.cpp
@@ -17,7 +17,6 @@
 namespace sd::ggml_graph_cut {
 
     static constexpr double MAX_VRAM_BYTES_PER_GIB      = 1024.0 * 1024.0 * 1024.0;
-    static constexpr size_t MAX_VRAM_AUTO_RESERVE_BYTES = 1024ULL * 1024ULL * 1024ULL;
 
     static std::string graph_cut_tensor_display_name(const ggml_tensor* tensor) {
         if (tensor == nullptr) {
@@ -93,45 +92,47 @@ namespace sd::ggml_graph_cut {
         return static_cast<float>(static_cast<double>(max_vram_bytes) / MAX_VRAM_BYTES_PER_GIB);
     }
 
-    static size_t resolve_auto_max_vram_bytes(ggml_backend_t backend) {
+    static size_t resolve_auto_max_vram_bytes(float spare_vram, ggml_backend_t backend) {
         if (backend == nullptr) {
-            LOG_WARN("--max-vram -1 requested, but no backend is available; disabling graph splitting");
+            LOG_WARN("--max-vram < 0 requested, but no backend is available; disabling graph splitting");
             return 0;
         }
 
         ggml_backend_dev_t dev = ggml_backend_get_device(backend);
         if (dev == nullptr) {
-            LOG_WARN("--max-vram -1 requested, but no backend device is available; disabling graph splitting");
+            LOG_WARN("--max-vram < 0 requested, but no backend device is available; disabling graph splitting");
             return 0;
         }
         if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) {
-            LOG_WARN("--max-vram -1 requested, but the main backend is CPU; disabling graph splitting");
+            LOG_WARN("--max-vram < 0 requested, but the main backend is CPU; disabling graph splitting");
             return 0;
         }
 
         size_t free_vram  = 0;
         size_t total_vram = 0;
         ggml_backend_dev_memory(dev, &free_vram, &total_vram);
+        const double spare_bytes = MAX_VRAM_BYTES_PER_GIB * spare_vram;  // stay in double: casting NaN or an out-of-range value to size_t is UB
 
-        if (free_vram <= MAX_VRAM_AUTO_RESERVE_BYTES) {
-            LOG_WARN("--max-vram -1 requested, but free VRAM is %.2f GiB; reserving 1.00 GiB leaves no graph budget",
-                     free_vram / MAX_VRAM_BYTES_PER_GIB);
+        if (!(spare_bytes < static_cast<double>(free_vram))) {  // negated form also rejects a NaN reserve
+            LOG_WARN("--max-vram < 0 requested, but free VRAM is %.2f GiB; reserving %.2f GiB leaves no graph budget",
+                     free_vram / MAX_VRAM_BYTES_PER_GIB, spare_vram);
             return 0;
         }
 
-        const size_t max_vram_bytes = free_vram - MAX_VRAM_AUTO_RESERVE_BYTES;
-        LOG_INFO("--max-vram -1 auto-detected %.2f GiB free VRAM (%.2f GiB total), reserving 1.00 GiB; using %.2f GiB",
+        const size_t max_vram_bytes = free_vram - static_cast<size_t>(spare_bytes);  // cast is in range: spare_bytes < free_vram was checked above
+        LOG_INFO("--max-vram < 0 auto-detected %.2f GiB free VRAM (%.2f GiB total), reserving %.2f GiB; using %.2f GiB",
                  free_vram / MAX_VRAM_BYTES_PER_GIB,
                  total_vram / MAX_VRAM_BYTES_PER_GIB,
+                 spare_vram,
                  max_vram_bytes / MAX_VRAM_BYTES_PER_GIB);
         return max_vram_bytes;
     }
 
     float resolve_max_vram_gib(float max_vram, ggml_backend_t backend) {
-        if (max_vram != -1.f) {
+        if (max_vram >= 0.f) {  // non-negative values are explicit budgets and pass through unchanged
             return max_vram;
         }
-        return max_vram_bytes_to_gib(resolve_auto_max_vram_bytes(backend));
+        return max_vram_bytes_to_gib(resolve_auto_max_vram_bytes(-max_vram, backend));  // negative: |max_vram| GiB of free VRAM is kept in reserve
     }
 
     static Segment make_segment_seed(const Plan& plan,