Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion examples/cli/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,8 @@ Context Options:
then threads will be set to the number of CPU physical cores
--chroma-t5-mask-pad <int> t5 mask pad size of chroma
--max-vram <float> maximum VRAM budget in GiB for graph-cut segmented execution. 0 disables
graph splitting; -1 auto-detects free VRAM minus 1 GiB
graph splitting; a negative value auto-detects free VRAM, sparing the
specified value (e.g. -0.5 will keep at least 0.5 GiB free)
--force-sdxl-vae-conv-scale force use of conv scale on sdxl vae
--offload-to-cpu place the weights in RAM to save VRAM, and automatically load them into VRAM
when needed
Expand Down
2 changes: 1 addition & 1 deletion examples/common/common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -413,7 +413,7 @@ ArgOptions SDContextParams::get_options() {
options.float_options = {
{"",
"--max-vram",
"maximum VRAM budget in GiB for graph-cut segmented execution. 0 disables graph splitting; -1 auto-detects free VRAM minus 1 GiB",
"maximum VRAM budget in GiB for graph-cut segmented execution. 0 disables graph splitting; a negative value auto-detects free VRAM, sparing the specified value (e.g. -0.5 will keep at least 0.5 GiB free)",
&max_vram},
};

Expand Down
3 changes: 2 additions & 1 deletion examples/server/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,8 @@ Context Options:
then threads will be set to the number of CPU physical cores
--chroma-t5-mask-pad <int> t5 mask pad size of chroma
--max-vram <float> maximum VRAM budget in GiB for graph-cut segmented execution. 0 disables
graph splitting; -1 auto-detects free VRAM minus 1 GiB
graph splitting; a negative value auto-detects free VRAM, sparing the
specified value (e.g. -0.5 will keep at least 0.5 GiB free)
--force-sdxl-vae-conv-scale force use of conv scale on sdxl vae
--offload-to-cpu place the weights in RAM to save VRAM, and automatically load them into VRAM
when needed
Expand Down
25 changes: 13 additions & 12 deletions src/ggml_graph_cut.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
namespace sd::ggml_graph_cut {

static constexpr double MAX_VRAM_BYTES_PER_GIB = 1024.0 * 1024.0 * 1024.0;
static constexpr size_t MAX_VRAM_AUTO_RESERVE_BYTES = 1024ULL * 1024ULL * 1024ULL;

static std::string graph_cut_tensor_display_name(const ggml_tensor* tensor) {
if (tensor == nullptr) {
Expand Down Expand Up @@ -93,45 +92,47 @@ namespace sd::ggml_graph_cut {
return static_cast<float>(static_cast<double>(max_vram_bytes) / MAX_VRAM_BYTES_PER_GIB);
}

static size_t resolve_auto_max_vram_bytes(ggml_backend_t backend) {
static size_t resolve_auto_max_vram_bytes(float spare_vram, ggml_backend_t backend) {
if (backend == nullptr) {
LOG_WARN("--max-vram -1 requested, but no backend is available; disabling graph splitting");
LOG_WARN("--max-vram < 0 requested, but no backend is available; disabling graph splitting");
return 0;
}

ggml_backend_dev_t dev = ggml_backend_get_device(backend);
if (dev == nullptr) {
LOG_WARN("--max-vram -1 requested, but no backend device is available; disabling graph splitting");
LOG_WARN("--max-vram < 0 requested, but no backend device is available; disabling graph splitting");
return 0;
}
if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) {
LOG_WARN("--max-vram -1 requested, but the main backend is CPU; disabling graph splitting");
LOG_WARN("--max-vram < 0 requested, but the main backend is CPU; disabling graph splitting");
return 0;
}

size_t free_vram = 0;
size_t total_vram = 0;
ggml_backend_dev_memory(dev, &free_vram, &total_vram);
size_t spare_bytes = static_cast<size_t>(MAX_VRAM_BYTES_PER_GIB * spare_vram);

if (free_vram <= MAX_VRAM_AUTO_RESERVE_BYTES) {
LOG_WARN("--max-vram -1 requested, but free VRAM is %.2f GiB; reserving 1.00 GiB leaves no graph budget",
free_vram / MAX_VRAM_BYTES_PER_GIB);
if (free_vram <= spare_bytes) {
LOG_WARN("--max-vram < 0 requested, but free VRAM is %.2f GiB; reserving %.2f GiB leaves no graph budget",
free_vram / MAX_VRAM_BYTES_PER_GIB, spare_vram);
return 0;
}

const size_t max_vram_bytes = free_vram - MAX_VRAM_AUTO_RESERVE_BYTES;
LOG_INFO("--max-vram -1 auto-detected %.2f GiB free VRAM (%.2f GiB total), reserving 1.00 GiB; using %.2f GiB",
const size_t max_vram_bytes = free_vram - spare_bytes;
LOG_INFO("--max-vram < 0 auto-detected %.2f GiB free VRAM (%.2f GiB total), reserving %.2f GiB; using %.2f GiB",
free_vram / MAX_VRAM_BYTES_PER_GIB,
total_vram / MAX_VRAM_BYTES_PER_GIB,
spare_vram,
max_vram_bytes / MAX_VRAM_BYTES_PER_GIB);
return max_vram_bytes;
}

float resolve_max_vram_gib(float max_vram, ggml_backend_t backend) {
if (max_vram != -1.f) {
if (max_vram >= 0.f) {
return max_vram;
}
return max_vram_bytes_to_gib(resolve_auto_max_vram_bytes(backend));
return max_vram_bytes_to_gib(resolve_auto_max_vram_bytes(-max_vram, backend));
}

static Segment make_segment_seed(const Plan& plan,
Expand Down
Loading